-rw-r--r--  PendingReleaseNotes                                     38
-rw-r--r--  configure.ac                                             2
-rw-r--r--  debian/changelog                                         6
-rw-r--r--  doc/changelog/v0.67.4.txt                              550
-rw-r--r--  doc/release-notes.rst                                   87
-rw-r--r--  fusetrace/fusetrace_ll.cc                                2
-rwxr-xr-x  qa/workunits/rbd/copy.sh                                12
-rwxr-xr-x  qa/workunits/rbd/import_export.sh                        8
-rw-r--r--  src/client/Client.cc                                   109
-rw-r--r--  src/client/Client.h                                     17
-rw-r--r--  src/client/fuse_ll.cc                                   20
-rw-r--r--  src/common/bloom_filter.cc                              95
-rw-r--r--  src/common/bloom_filter.hpp                            181
-rw-r--r--  src/common/config_opts.h                                13
-rw-r--r--  src/common/hobject.h                                     4
-rw-r--r--  src/mon/Monitor.cc                                     206
-rw-r--r--  src/mon/Monitor.h                                        2
-rw-r--r--  src/mon/MonmapMonitor.cc                                39
-rw-r--r--  src/mon/PGMap.cc                                        58
-rw-r--r--  src/mon/PGMap.h                                         37
-rw-r--r--  src/mon/PGMonitor.cc                                    24
-rw-r--r--  src/os/FileStore.cc                                    189
-rw-r--r--  src/os/FileStore.h                                      15
-rw-r--r--  src/osd/PG.cc                                           37
-rw-r--r--  src/osd/PG.h                                             5
-rw-r--r--  src/osd/PGBackend.h                                     20
-rw-r--r--  src/osd/PGLog.cc                                         4
-rw-r--r--  src/osd/ReplicatedBackend.cc                            72
-rw-r--r--  src/osd/ReplicatedBackend.h                             20
-rw-r--r--  src/osd/ReplicatedPG.cc                                363
-rw-r--r--  src/osd/ReplicatedPG.h                                 113
-rw-r--r--  src/rbd_fuse/rbd-fuse.c                                  2
-rw-r--r--  src/test/ObjectMap/test_store_tool/test_store_tool.cc  90
-rw-r--r--  src/test/cli-integration/rbd/formatted-output.t         22
-rw-r--r--  src/test/common/test_bloom_filter.cc                    71
-rw-r--r--  src/test/encoding/types.h                                1
-rw-r--r--  src/test/osd/RadosModel.h                                5
-rw-r--r--  src/test/osd/TestRados.cc                                9
38 files changed, 2027 insertions, 521 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 9a751ffdb49..a3ec73290f3 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,37 +1,3 @@
-v0.70
-~~~~~
-
-* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
- don't drop a reference to the completion object on error; the caller needs to
- take care of that. This has never really worked correctly, and we were leaking
- an object.
-
-* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
- specified location, as that's a job for 'ceph osd crush add'. It will
- however continue to work just the same as long as the osd already exists
- in the crush map.
-
-* The OSD now enforces that class write methods cannot both mutate an
- object and return data. The rbd.assign_bid method, the lone
- offender, has been removed. This breaks compatibility with
- pre-bobtail librbd clients by preventing them from creating new
- images.
-
-* librados now returns on commit instead of ack for synchronous calls.
- This is a bit safer in the case where both OSDs and the client crash, and
- is probably how it should have been acting from the beginning. Users are
- unlikely to notice but it could result in lower performance in some
- circumstances. Those who care should switch to using the async interfaces,
- which let you specify safety semantics precisely.
-
-* The C++ librados AioComplete::get_version() method was incorrectly
- returning an int (usually 32-bits). To avoid breaking library
- compatibility, a get_version64() method is added that returns the
- full-width value. The old method is deprecated and will be removed
- in a future release. Users of the C++ librados API that make use of
- the get_version() method should modify their code to avoid getting a
- value that is truncated from 64 to 32 bits.
-
v0.71
~~~~~
@@ -51,3 +17,7 @@ v0.71
* Any direct users of the 'tmap' portion of the librados API should be
aware that the automatic tmap -> omap conversion functionality has
been removed.
+
+* Most output that used K or KB (e.g., for kilobyte) now uses a
+ lower-case k to match the official SI convention. Any scripts that
+ parse output and check for an upper-case K will need to be modified.
diff --git a/configure.ac b/configure.ac
index eeecdbeffc8..1eee4609ec1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.69], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.70], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
diff --git a/debian/changelog b/debian/changelog
index ce73472f9eb..4628bb52175 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (0.70-1) stable; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Fri, 04 Oct 2013 20:11:51 +0000
+
ceph (0.69-1) precise; urgency=low
* New upstream release
diff --git a/doc/changelog/v0.67.4.txt b/doc/changelog/v0.67.4.txt
new file mode 100644
index 00000000000..73b997ea304
--- /dev/null
+++ b/doc/changelog/v0.67.4.txt
@@ -0,0 +1,550 @@
+commit ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Thu Oct 3 22:41:31 2013 +0000
+
+ v0.67.4
+
+commit 5cd66d3b4bca92b402c95ab256fbc3f0329c446f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 20 14:04:47 2013 -0700
+
+ rgw: fix keystone token expiration test
+
+ Fixes: #6360
+ The test was inverted, need expiration to be greater than
+ current time in order for token to be valid.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+
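The inverted comparison meant a token was treated as valid only after it had
already expired. A minimal sketch of the corrected check (illustrative C++;
the names are hypothetical, not the actual rgw code):

    #include <ctime>

    // A token is valid only while its expiration still lies in the future.
    // The bug was the reversed comparison (expiration < now).
    bool token_is_valid(time_t expiration, time_t now) {
      return expiration > now;
    }
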
+commit e0203c61a3f45fdd6d3d3ece26fef6152bdc036d
+Author: David Zafman <david.zafman@inktank.com>
+Date: Wed Sep 11 16:55:06 2013 -0700
+
+ osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active
+
+ Caused by 944f3b73531af791c90f0f061280160003545c63
+
+ Fixes: #6291
+
+ Backport: dumpling
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 139a714e13aa3c7f42091270b55dde8a17b3c4b8)
+
+ Conflicts:
+
+ src/osd/OSD.cc
+
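A hedged sketch of the capping logic this commit describes (illustrative,
not the actual OSD.cc code):

    #include <algorithm>

    // Start at most the headroom left under the configured cap, never more.
    unsigned ops_to_start(unsigned max_active, unsigned active, unsigned wanted) {
      unsigned headroom = (max_active > active) ? max_active - active : 0;
      return std::min(wanted, headroom);  // the Ceph tree uses the MIN() macro
    }
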
+commit c376708358cedb5561fbb43e9b9e622df3ea7a58
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Wed Sep 25 22:08:24 2013 +0100
+
+ mon: OSDMonitor: do not write full_latest during trim
+
+ On commit 81983bab we patched OSDMonitor::update_from_paxos() such that we
+ write the latest full map version to 'full_latest' each time the latest
+ full map was built from the incremental versions.
+
+ This change however clashed with OSDMonitor::encode_trim_extra(), which
+ also wrote to 'full_latest' on each trim, writing instead the version of
+ the *oldest* full map. This duality of behaviors could lead the store
+ to an inconsistent state across the monitors (although there's no sign of
+ it actually imposing any issues besides rebuilding already existing full
+ maps on some monitors).
+
+ We now stop OSDMonitor::encode_trim_extra() from writing to 'full_latest'.
+ This function will still write out the oldest full map it has in the store,
+ but it will no longer write to full_latest, instead leaving it up to
+ OSDMonitor::update_from_paxos() to figure it out -- and it already does.
+
+ Fixes: #6378
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit bd0f29a2c28cca496ec830eac932477ebf3182ba)
+
+commit de40d0b3e35ab0124cd3c4ebfcaa435ab8abfab9
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Oct 1 15:53:42 2013 -0700
+
+ crush: invalidate rmap on create (and thus decode)
+
+ If we have an existing CrushWrapper object and decode from a bufferlist,
+ reset build_rmaps so that they get rebuilt.
+
+ Remove the build_rmaps() all in decode that was useless on a redecode
+ (because have_rmaps == true in that case and it did nothing).
+
+ Fixes: #6442
+ Backport: dumpling, maybe cuttlefish
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21)
+
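The idea is that decoding replaces the forward maps, so any cached reverse
maps must be flagged stale and rebuilt lazily on the next lookup. A minimal
sketch of that pattern (illustrative; CrushWrapper's real fields differ):

    // Invalidate cached reverse maps whenever the source data changes.
    struct wrapper_sketch {
      bool have_rmaps = false;   // are the reverse maps current?
      void decode(/* bufferlist::iterator& */) {
        // ... decode forward name/bucket maps ...
        have_rmaps = false;      // force a rebuild on next use
      }
    };
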
+commit 32f5233288c47d95b87c0a9cab5f9c2ffcf15417
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Mon Sep 30 14:58:11 2013 -0700
+
+ Invoke python with /usr/bin/env python instead of directly
+
+ Fixes: #6311
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit b9000b314b9166845ff302d4a827a996775d9a14)
+
+commit 66aeca5a9079be398403bbff67bd5bf68c6fb111
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Sep 25 10:10:21 2013 -0700
+
+ qa/workunits/mon/crush_ops.sh: fix test
+
+ Fix root.
+
+ Fixes: #6392
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c8cae87e9e08468cc86145e0fd60c05d12826239)
+
+commit beb366302a125dd422c4f092b12eb541cb3bc788
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 09:04:34 2013 -0700
+
+ Revert "ceph: parse CEPH_ARGS environment variable"
+
+ This reverts commit 67a95b9880c9bc6e858150352318d68d64ed74ad.
+
+ We now put CEPH_ARGS in the actual args we parse in python, which are passed
+ to rados piecemeal later. This lets you put things like --id ... in there
+ that need to be parsed before librados is initialized.
+ (cherry picked from commit 97f462be4829f0167ed3d65e6694dfc16f1f3243)
+
+commit b475ff9576f145d31c053213c699e13df76d2bcb
+Author: Benoît Knecht <benoit.knecht@fsfe.org>
+Date: Mon Sep 23 15:58:42 2013 +0200
+
+ Add CEPH_ARGS at the end of sys.argv
+
+ This allows, for instance, to pass a different client name to ceph by
+ exporting CEPH_ARGS="--id client_id".
+
+ Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 30abe3244c86cbbe1f5b005850c29c9c0eafcad4)
+
+commit 94548b4b67cca37366c7d8719209a6d2e7956811
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Sep 24 15:26:03 2013 -0700
+
+ mon/OSDMonitor: fix 'ceph osd crush reweight ...'
+
+ The adjust method returns a count of adjusted items.
+
+ Add a test.
+
+ Fixes: #6382
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 3de32562b55c6ece3a6ed783c36f8b9f21460339)
+
+commit 00ff7f5c20e13869d0694379739ba4e61d44b97c
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 10 00:20:41 2013 +0100
+
+ qa: workunits: mon: crush_ops: test 'ceph osd crush move'
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 3bc618b7b46496c5110edde0da9cae5d3e68e0e1)
+
+commit 0ff5b4a96833681e92cc41f019a569134474f4cf
+Author: Loic Dachary <loic@dachary.org>
+Date: Tue Sep 24 19:04:23 2013 +0200
+
+ osd: change warn_interval_multiplier to uint32_t
+
+ to prevent overflow in OpTracker::check_ops_in_flight when
+ multiplying warn_interval_multiplier *= 2
+
+ Backport: cuttlefish, dumpling
+
+ http://tracker.ceph.com/issues/6370 fixes #6370
+
+ Signed-off-by: Loic Dachary <loic@dachary.org>
+ (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db)
+
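For context: repeatedly doubling a signed integer eventually overflows, which
is undefined behavior in C++, whereas unsigned 32-bit arithmetic is defined
modulo 2^32. A one-line illustration of the rule motivating the type change
(not the actual OpTracker code):

    #include <cstdint>

    // Well-defined for unsigned types: wraps modulo 2^32 instead of
    // invoking undefined behavior as signed overflow would.
    uint32_t next_warn_interval(uint32_t multiplier) {
      return multiplier * 2;
    }
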
+commit fb15040b6cec6221baa550ddfffade823f784c4a
+Author: David Zafman <david.zafman@inktank.com>
+Date: Mon Sep 9 13:01:12 2013 -0700
+
+ crushtool: do not dump core with non-unique bucket IDs
+
+ Return -EEXIST on duplicate ID
+ BUG FIX: crush_add_bucket() mixes error returns and IDs
+ Add optional argument to return generated ID
+
+ Fixes: #6246
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 8c76f3a0f9cf100ea2c941dc2b61c470aa5033d7)
+
+commit 410db3f30c6eb54b807908c1f251ad4026e7d446
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:06:30 2013 +0100
+
+ qa: workunits: cephtool: check if 'heap' commands are parseable
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit b1eeaddd5f214c1b0883b44fc8cae07c649be7c4)
+
+commit 062060a38bb26ff260cc51accc534413d726de49
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:50:27 2013 +0100
+
+ osd: OSD: add 'heap' command to known osd commands array
+
+ Must have been forgotten during the cli rework.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 296f2d0db31e9f5a59a3a62a1e95b6c440430fa3)
+
+commit 3f32f57b98e0224a1d30b2a81d7d260be0f53800
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:43:27 2013 +0100
+
+ mds: MDS: pass only heap profiler commands instead of the whole cmd vector
+
+ The heap profiler doesn't care, nor should it, what our command name is.
+ It only cares about the commands it handles.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 238fe272c6bdb62d4e57fd8555c0136de99c8129)
+
+commit 46dcc46617d8f35ab8433540b22343ddcbcc3716
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:41:14 2013 +0100
+
+ perfglue/heap_profiler.cc: expect args as first element on cmd vector
+
+ We used to pass 'heap' as the first element of the cmd vector when
+ handling commands. We haven't been doing so for a while now, so we
+ needed to fix this.
+
+ Not expecting 'heap' also makes sense, considering that what we need to
+ know when we reach this function is what command we should handle, and
+ we should not care what name the caller used when invoking us.
+
+ Fixes: #6361
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit c98b910d49bd2b46ceafdc430044a31524c29f5b)
+
+commit 9dc5f15fbae22244ad1f62925e17c9d81e856e55
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Sep 16 14:35:25 2013 -0700
+
+ rgw: destroy get_obj handle in copy_obj()
+
+ Fixes: #6176
+ Backport: dumpling
+ We take different code paths in copy_obj, make sure we close the handle
+ when we exit the function. Move the call to finish_get_obj() out of
+ copy_obj_data(), as we don't create the handle there; that should
+ make the code less confusing and less prone to errors.
+ Also, note that RGWRados::get_obj() also calls finish_get_obj(). For
+ everything to work in concert we need to pass a pointer to the handle
+ and not the handle itself. Therefore we needed to also change the call
+ to copy_obj_data().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 9e98620e4325d15c88440a890b267131613e1aa1)
+
+commit 471233e98a9f64ad513a4a196b7661b80534cb00
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Mon Sep 9 23:14:11 2013 +0100
+
+ mon: MonCommands: expect a CephString as 1st arg for 'osd crush move'
+
+ Fixes: #6230
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 7d3799fde19138f957f26ec6be10a8a0000fc1f0)
+
+commit 2908225092bd2aa1b8afcb7848c1cdac5bd9e638
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 16:23:33 2013 -0700
+
+ osd: revert 'osd max xattr size' limit
+
+ Set it to 0 (unlimited) for now.
+
+ Backport: dumpling
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit abb88d70643c3a76435b7a9d5b04ff29f7502361)
+
+commit b3d3b3747c1eef695138dac828e5fcb435309c7b
+Author: Greg Farnum <greg@inktank.com>
+Date: Wed Sep 11 16:24:32 2013 -0700
+
+ mds: be more careful about decoding LogEvents
+
+ We need to wrap the full decode section or we can abort the process
+ if there's an issue (which we may want to just skip by).
+
+ Signed-off-by: Greg Farnum <greg@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 73289b34b0be5b6612e38944794d59b5e789f841)
+
+commit 06c58132199ed22413b509dfa751321ccdb24225
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 17 17:58:20 2013 +0100
+
+ mon: OSDMonitor: multiple rebuilt full maps per transaction
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 0d20cae0be701c5b6151a26ee5e4fe24d89aa20a)
+
+commit 65bbcaf4b68790dae4506c1f5db237077e1ff0ae
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:03:50 2013 +0100
+
+ mon: OSDMonitor: update latest_full while rebuilding full maps
+
+ Not doing so will make the monitor rebuild the osdmap full versions, even
+ though they may have been rebuilt before, every time the monitor starts.
+
+ This mostly happens when the cluster is left in an unhealthy state for
+ a long period of time and incremental versions build up. Even though we
+ build the full maps on update_from_paxos(), not updating 'full_latest'
+ leads to the situation initially described.
+
+ Fixes: #6322
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 81983bab3630520d6c7ee9b7e4a747bc17b8c5c3)
+
+commit 9b9edb04581cca15e67c567332529f5b3f426743
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:00:55 2013 +0100
+
+ mon: OSDMonitor: smaller transactions when rebuilding full versions
+
+ Otherwise, for considerably sized rebuilds, the monitor will not only
+ consume vast amounts of memory, but it will also have trouble committing
+ the transaction. Anyway, it's also a good idea to adjust transactions to
+ the granularity we want, and to be fair we care that each rebuilt full map
+ gets to disk, even if subsequent full maps don't (those can be rebuilt
+ later).
+
+ Fixes: #6323
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 4ac1570c5cdcd6556dc291cc6d7878fd92d343ae)
+
+commit 298811f7a15541b9ec1015c416ad2aa075be5691
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Wed Aug 28 15:51:01 2013 +0100
+
+ mon: OSDMonitor: check if pool is on unmanaged snaps mode on mk/rmsnap
+
+ Backport: dumpling
+ Fixes: #6047
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit fab79543c54c2e446d3f76520d7906645c6b0075)
+
+commit a992664435db9dde3745eb7f354cce3fc5400a47
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:32:17 2013 -0700
+
+ lru_map: don't use list::size()
+
+ replace list::size() with map::size(), which should have
+ a constant time complexity.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 7c1d2ded8fa8061bf3f14932800998b963745dd1)
+
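In pre-C++11 libstdc++, std::list::size() walks the list (O(n)), while
std::map::size() is O(1); when a map and a list track the same entries, size
queries should go through the map. A minimal sketch of the pattern
(illustrative, not the actual lru_map):

    #include <list>
    #include <map>
    #include <string>

    struct lru_sketch {
      std::map<std::string, int> entries;   // lookup
      std::list<std::string> order;         // LRU ordering
      // O(1); order.size() may be O(n) on pre-C++11 libstdc++.
      size_t size() const { return entries.size(); }
    };
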
+commit 788546ea71c994ff35323747294ed9c177fe7020
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:30:19 2013 -0700
+
+ common/lru_map: rename tokens to entries
+
+ This code was originally used in a token cache, now
+ as a generic infrastructure rename token fields.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 532e41a9985a16b35a6e49cdcba38af0ad166fa8)
+
+commit babeb00c42af760b3e7575166479e95365cfcc0a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 18 10:37:21 2013 -0700
+
+ rgw: use bufferlist::append() instead of bufferlist::push_back()
+
+ push_back() expects char *, whereas append can append a single char.
+ Appending a NULL char to push_back is cast as a NULL pointer which is
+ bad.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit 08fe028bad13096d482454a2f303158727c363ff)
+
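A miniature model of the overload pitfall the commit describes (hypothetical
types; the real interface is ceph::bufferlist):

    #include <cassert>
    #include <string>

    struct buffer_model {
      std::string data;
      void push_back(const char* p) { assert(p); data += p; } // 0 => NULL ptr
      void append(char c) { data.push_back(c); }              // one safe byte
    };

    int main() {
      buffer_model bl;
      // bl.push_back(0);  // literal 0 converts to a NULL pointer -- the bug
      bl.append('\0');     // the fix: append a single NUL character
    }
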
+commit daf85c45dd4d158bc7c33a2fb784857bc7db35cd
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 13:46:31 2013 -0700
+
+ rgw: NULL terminate buffer before parsing it
+
+ Fixes: #6175
+ Backport: dumpling
+ We get a buffer off the remote gateway which might
+ not be NULL terminated. The JSON parser needs the
+ buffer to be NULL terminated even though we provide
+ a buffer length as it calls strlen().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit e7f7483192cddca1159aba439ce62b1e78669d51)
+
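A hedged sketch of the defensive copy, using std::vector in place of the
actual rgw buffer type:

    #include <cstddef>
    #include <vector>

    // A buffer off the wire is not guaranteed to be NUL terminated; append
    // one byte before handing it to a parser that calls strlen() internally.
    std::vector<char> terminated_copy(const char* data, size_t len) {
      std::vector<char> buf(data, data + len);
      buf.push_back('\0');
      return buf;
    }
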
+commit c73040a5518971813b9ebaae1624c5bacef315d0
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 22:30:12 2013 -0700
+
+ rgw: don't call list::size() in ObjectCache
+
+ Fixes: #6286
+ Use an external counter instead of calling list::size()
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6)
+
+commit a855aba9d18936e9a060119e041518790cd4b831
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Sep 10 12:18:55 2013 -0700
+
+ rgw: drain pending requests before completing write
+
+ Fixes: #6268
+ When doing aio write of objects (either regular or multipart parts) we
+ need to drain pending aio requests. Otherwise if gateway goes down then
+ object might end up corrupted.
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 626669afaa333d73707553a85f5c874e99e9cbd8)
+
+commit 670db7e80ddc9c26c43a4f66907a5996ce207c4d
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 6 22:33:38 2013 -0700
+
+ rgw: fix get cors, delete cors
+
+ Remove a couple of variables that overrode class member. Not
+ really clear how it was working before, might have been a bad
+ merge / rebase.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 13872785aeeddbe1b8dd97e49fd6a2d879514f8d)
+
+commit a304016fa01b02efd500135c00b9bf3407a9999c
+Merge: 408cd61 ac0a30f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 09:47:10 2013 -0700
+
+ Merge branch 'wip-6078-dumpling' into dumpling
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+
+commit ac0a30feb8c64a3b80d9c519a7b561213403afab
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:25:20 2013 -0700
+
+ rgw: fix certain return status cases in CORS
+
+ Change return values in certain cases, reorder
+ checks, etc.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 13b28cc3f1eb8ef42875b630c485ee0105cd244a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:24:36 2013 -0700
+
+ rgw: add COPY method to be handled by CORS
+
+ Was missing this http method.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit d45c87ea738807487e72c0719b0d3d459cbe19e9
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:45 2013 -0700
+
+ rgw: fix CORS rule check
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 986fa92a7a1d88111ba28457160adfcfdaabc5d2
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:18 2013 -0700
+
+ rgw: don't handle CORS if rule not found (is NULL)
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 71873aba6553492d3ad71596cefd7c841030a277
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 13:38:55 2013 -0700
+
+ rgw: tie CORS header response to all relevant operations
+
+ Have the CORS responses on all relevant operations. Also add headers
+ on failure cases.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 94e7b594d85dbd26e58d823b41f418032e9f163f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 10:00:53 2013 -0700
+
+ rgw: add a generic CORS response handling
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit c3385d8a102faf5379559bb98cf89637ceda1579
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 17:22:46 2013 -0700
+
+ rgw: OPTIONS request doesn't need to read object info
+
+ This is a bucket-only operation, so we shouldn't look at the
+ object. The object may not exist, and we might respond with a Not
+ Exists response, which is not what we want.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit a5fdd44e5d8ce4b8d82273d83e27aea19e63aa7c
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 14:43:28 2013 -0700
+
+ rgw: remove use of s->bucket_cors
+
+ Some old code still tried to use s->bucket_cors, which was
+ abandoned in a cleanup work.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index 604b4fa296b..2b566baa0ea 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,37 @@
Release Notes
===============
+v0.70
+-----
+
+Upgrading
+~~~~~~~~~
+
+* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
+ don't drop a reference to the completion object on error; the caller needs to
+ take care of that. This has never really worked correctly, and we were leaking
+ an object (a sketch of the new caller responsibility follows this list).
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+ specified location, as that's a job for 'ceph osd crush add'. It will
+ however continue to work just the same as long as the osd already exists
+ in the crush map.
+
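Since the async pool calls no longer release the completion on error, the
caller owns it on every path. A minimal sketch using the public librados C++
API (error handling abbreviated):

    #include <rados/librados.hpp>

    int create_pool(librados::Rados& cluster, const char* name) {
      librados::PoolAsyncCompletion* c =
          librados::Rados::pool_async_create_completion();
      int r = cluster.pool_create_async(name, c);
      if (r < 0) {
        c->release();   // previously leaked; now the caller's job
        return r;
      }
      c->wait();
      int ret = c->get_return_value();
      c->release();
      return ret;
    }
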
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* mon: a few 'ceph mon add' races fixed (command is now idempotent) (Joao Luis)
+* crush: fix name caching
+* rgw: fix a few minor memory leaks (Yehuda Sadeh)
+* ceph: improve parsing of CEPH_ARGS (Benoit Knecht)
+* mon: avoid rewriting full osdmaps on restart (Joao Luis)
+* crc32c: fix optimized crc32c code (it now detects arch support properly)
+* mon: fix 'ceph osd crush reweight ...' (Joao Luis)
+* osd: revert xattr size limit (fixes large rgw uploads)
+* mds: fix heap profiler commands (Joao Luis)
+* rgw: fix inefficient use of std::list::size() (Yehuda Sadeh)
+
+
v0.69
-----
@@ -19,6 +50,28 @@ Upgrading
but because the server-side behavior has changed it is possible that
an application misusing the interface may now get errors.
+* The OSD now enforces that class write methods cannot both mutate an
+ object and return data. The rbd.assign_bid method, the lone
+ offender, has been removed. This breaks compatibility with
+ pre-bobtail librbd clients by preventing them from creating new
+ images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+ This is a bit safer in the case where both OSDs and the client crash, and
+ is probably how it should have been acting from the beginning. Users are
+ unlikely to notice but it could result in lower performance in some
+ circumstances. Those who care should switch to using the async interfaces,
+ which let you specify safety semantics precisely (see the sketch after
+ this list).
+
+* The C++ librados AioComplete::get_version() method was incorrectly
+ returning an int (usually 32-bits). To avoid breaking library
+ compatibility, a get_version64() method is added that returns the
+ full-width value. The old method is deprecated and will be removed
+ in a future release. Users of the C++ librados API that make use of
+ the get_version() method should modify their code to avoid getting a
+ value that is truncated from 64 to 32 bits.
+
+
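Callers who want explicit control over safety semantics can use the async
interface and choose what to wait for. A hedged sketch with the dumpling-era
librados C++ API:

    #include <rados/librados.hpp>

    int write_and_commit(librados::IoCtx& io, const std::string& oid,
                         librados::bufferlist& bl) {
      librados::AioCompletion* c = librados::Rados::aio_create_completion();
      int r = io.aio_write(oid, c, bl, bl.length(), 0);
      if (r < 0) { c->release(); return r; }
      c->wait_for_safe();  // commit (on disk); wait_for_complete() waits for ack
      int ret = c->get_return_value();
      c->release();
      return ret;
    }
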
Notable Changes
~~~~~~~~~~~~~~~
@@ -120,6 +173,40 @@ Notable Changes
* sysvinit: add condrestart command (Dan van der Ster)
+
+v0.67.4 "Dumpling"
+------------------
+
+This point release fixes an important performance issue with radosgw,
+keystone authentication token caching, and CORS. All users
+(especially those of rgw) are encouraged to upgrade.
+
+Notable changes
+~~~~~~~~~~~~~~~
+
+* crush: fix invalidation of cached names
+* crushtool: do not crash on non-unique bucket ids
+* mds: be more careful when decoding LogEvents
+* mds: fix heap check debugging commands
+* mon: avoid rebuilding old full osdmaps
+* mon: fix 'ceph crush move ...'
+* mon: fix 'ceph osd crush reweight ...'
+* mon: fix writeout of full osdmaps during trim
+* mon: limit size of transactions
+* mon: prevent both unmanaged and pool snaps
+* osd: disable xattr size limit (prevents upload of large rgw objects)
+* osd: fix recovery op throttling
+* osd: fix throttling of log messages for very slow requests
+* rgw: drain pending requests before completing write
+* rgw: fix CORS
+* rgw: fix inefficient list::size() usage
+* rgw: fix keystone token expiration
+* rgw: fix minor memory leaks
+* rgw: fix null termination of buffer
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.4.txt>`.
+
+
v0.67.3 "Dumpling"
------------------
diff --git a/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc
index eb7100a867f..7f2b8438f1f 100644
--- a/fusetrace/fusetrace_ll.cc
+++ b/fusetrace/fusetrace_ll.cc
@@ -11,7 +11,7 @@
gcc -Wall `pkg-config fuse --cflags --libs` -lulockmgr fusexmp_fh.c -o fusexmp_fh
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#ifdef HAVE_CONFIG_H
#include <config.h>
diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh
index 8430fca7665..7abb3956c88 100755
--- a/qa/workunits/rbd/copy.sh
+++ b/qa/workunits/rbd/copy.sh
@@ -109,8 +109,8 @@ test_ls() {
rbd ls | grep test2
rbd ls | wc -l | grep 2
# look for fields in output of ls -l without worrying about space
- rbd ls -l | grep 'test1.*1024K.*1'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*1'
+ rbd ls -l | grep 'test2.*1024k.*1'
rbd rm test1
rbd rm test2
@@ -120,8 +120,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*2'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*2'
rbd rm test1
rbd rm test2
@@ -131,8 +131,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*1'
remove_images
# test that many images can be shown by ls
diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh
index 353a47fffbe..1813f7a9a88 100755
--- a/qa/workunits/rbd/import_export.sh
+++ b/qa/workunits/rbd/import_export.sh
@@ -66,7 +66,7 @@ dd if=/dev/urandom bs=1M count=1 of=/tmp/sparse2; truncate /tmp/sparse2 -s 2M
# 1M sparse, 1M data
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '2048K'
+rbd ls -l | grep sparse1 | grep '2048k'
[ "$(objects sparse1)" = '1' ]
# export, compare contents and on-disk size
@@ -77,7 +77,7 @@ rbd rm sparse1
# 1M data, 1M sparse
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '2048K'
+rbd ls -l | grep sparse2 | grep '2048k'
[ "$(objects sparse2)" = '0' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
@@ -88,7 +88,7 @@ rbd rm sparse2
truncate /tmp/sparse1 -s 10M
# import from stdin just for fun, verify still sparse
rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '10240K'
+rbd ls -l | grep sparse1 | grep '10240k'
[ "$(objects sparse1)" = '1' ]
rbd export sparse1 /tmp/sparse1.out
compare_files_and_ondisk_sizes /tmp/sparse1 /tmp/sparse1.out
@@ -99,7 +99,7 @@ rbd rm sparse1
dd if=/dev/urandom bs=2M count=1 of=/tmp/sparse2 oflag=append conv=notrunc
# again from stdin
rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '4096K'
+rbd ls -l | grep sparse2 | grep '4096k'
[ "$(objects sparse2)" = '0 2 3' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 77fd2084cf1..60a5e4550b8 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -148,9 +148,12 @@ Client::Client(Messenger *m, MonClient *mc)
timer(m->cct, client_lock),
ino_invalidate_cb(NULL),
ino_invalidate_cb_handle(NULL),
+ dentry_invalidate_cb(NULL),
+ dentry_invalidate_cb_handle(NULL),
getgroups_cb(NULL),
getgroups_cb_handle(NULL),
async_ino_invalidator(m->cct),
+ async_dentry_invalidator(m->cct),
tick_event(NULL),
monclient(mc), messenger(m), whoami(m->get_myname().num()),
initialized(false), mounted(false), unmounting(false),
@@ -410,11 +413,17 @@ void Client::shutdown()
admin_socket->unregister_command("dump_cache");
if (ino_invalidate_cb) {
- ldout(cct, 10) << "shutdown stopping invalidator finisher" << dendl;
+ ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
async_ino_invalidator.wait_for_empty();
async_ino_invalidator.stop();
}
+ if (dentry_invalidate_cb) {
+ ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
+ async_dentry_invalidator.wait_for_empty();
+ async_dentry_invalidator.stop();
+ }
+
objectcacher->stop(); // outside of client_lock! this does a join.
client_lock.Lock();
@@ -1532,7 +1541,7 @@ void Client::_closed_mds_session(MetaSession *s)
signal_context_list(s->waiting_for_open);
mount_cond.Signal();
remove_session_caps(s);
- kick_requests(s, true);
+ kick_requests_closed(s);
mds_sessions.erase(s->mds_num);
delete s;
}
@@ -1905,7 +1914,7 @@ void Client::handle_mds_map(MMDSMap* m)
if (newstate >= MDSMap::STATE_ACTIVE) {
if (oldstate < MDSMap::STATE_ACTIVE) {
- kick_requests(p->second, false);
+ kick_requests(p->second);
kick_flushing_caps(p->second);
signal_context_list(p->second->waiting_for_open);
kick_maxsize_requests(p->second);
@@ -1989,25 +1998,16 @@ void Client::send_reconnect(MetaSession *session)
}
-void Client::kick_requests(MetaSession *session, bool signal)
+void Client::kick_requests(MetaSession *session)
{
ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
-
for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
p != mds_requests.end();
- ++p)
+ ++p) {
if (p->second->mds == session->mds_num) {
- if (signal) {
- // only signal caller if there is a caller
- // otherwise, let resend_unsafe handle it
- if (p->second->caller_cond) {
- p->second->kick = true;
- p->second->caller_cond->Signal();
- }
- } else {
- send_request(p->second, session);
- }
+ send_request(p->second, session);
}
+ }
}
void Client::resend_unsafe_requests(MetaSession *session)
@@ -2018,6 +2018,25 @@ void Client::resend_unsafe_requests(MetaSession *session)
send_request(*iter, session);
}
+void Client::kick_requests_closed(MetaSession *session)
+{
+ ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
+ for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
+ p != mds_requests.end();
+ ++p) {
+ if (p->second->mds == session->mds_num) {
+ if (p->second->caller_cond) {
+ p->second->kick = true;
+ p->second->caller_cond->Signal();
+ }
+ p->second->item.remove_myself();
+ p->second->unsafe_item.remove_myself();
+ }
+ }
+ assert(session->requests.empty());
+ assert(session->unsafe_requests.empty());
+}
+
@@ -3551,6 +3570,45 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa
m->put();
}
+class C_Client_DentryInvalidate : public Context {
+private:
+ Client *client;
+ vinodeno_t dirino;
+ vinodeno_t ino;
+ string name;
+public:
+ C_Client_DentryInvalidate(Client *c, Dentry *dn) :
+ client(c), dirino(dn->dir->parent_inode->vino()),
+ ino(dn->inode->vino()), name(dn->name) { }
+ void finish(int r) {
+ client->_async_dentry_invalidate(dirino, ino, name);
+ }
+};
+
+void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
+{
+ ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
+ << " in dir " << dirino << dendl;
+ dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name);
+}
+
+void Client::_schedule_invalidate_dentry_callback(Dentry *dn)
+{
+ if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
+ async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn));
+}
+
+void Client::_invalidate_inode_parents(Inode *in)
+{
+ set<Dentry*>::iterator q = in->dn_set.begin();
+ while (q != in->dn_set.end()) {
+ Dentry *dn = *q++;
+ // FIXME: we play lots of unlink/link tricks when handling MDS replies,
+ // so in->dn_set doesn't always reflect the state of kernel's dcache.
+ _schedule_invalidate_dentry_callback(dn);
+ unlink(dn, false);
+ }
+}
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
@@ -3578,8 +3636,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
in->uid = m->head.uid;
in->gid = m->head.gid;
}
+ bool deleted_inode = false;
if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
in->nlink = m->head.nlink;
+ if (in->nlink == 0 &&
+ (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = true;
}
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
m->xattrbl.length() &&
@@ -3633,6 +3695,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
if (new_caps)
signal_cond_list(in->waitfor_caps);
+ // may drop inode's last ref
+ if (deleted_inode)
+ _invalidate_inode_parents(in);
+
m->put();
}
@@ -6319,6 +6385,17 @@ void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handl
async_ino_invalidator.start();
}
+void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle)
+{
+ Mutex::Locker l(client_lock);
+ ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl;
+ if (cb == NULL)
+ return;
+ dentry_invalidate_cb = cb;
+ dentry_invalidate_cb_handle = handle;
+ async_dentry_invalidator.start();
+}
+
void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle)
{
Mutex::Locker l(client_lock);
diff --git a/src/client/Client.h b/src/client/Client.h
index c7c9cef0e0c..df59f235de4 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -120,6 +120,9 @@ struct MetaRequest;
typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len);
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name);
+
typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids);
// ========================================================
@@ -211,10 +214,14 @@ class Client : public Dispatcher {
client_ino_callback_t ino_invalidate_cb;
void *ino_invalidate_cb_handle;
+ client_dentry_callback_t dentry_invalidate_cb;
+ void *dentry_invalidate_cb_handle;
+
client_getgroups_callback_t getgroups_cb;
void *getgroups_cb_handle;
Finisher async_ino_invalidator;
+ Finisher async_dentry_invalidator;
Context *tick_event;
utime_t last_cap_renew;
@@ -270,7 +277,8 @@ public:
void connect_mds_targets(int mds);
void send_request(MetaRequest *request, MetaSession *session);
MClientRequest *build_client_request(MetaRequest *request);
- void kick_requests(MetaSession *session, bool signal);
+ void kick_requests(MetaSession *session);
+ void kick_requests_closed(MetaSession *session);
void handle_client_request_forward(MClientRequestForward *reply);
void handle_client_reply(MClientReply *reply);
@@ -357,6 +365,7 @@ protected:
friend class C_Client_PutInode; // calls put_inode()
friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb
+ friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb
//int get_cache_size() { return lru.lru_get_size(); }
//void set_cache_size(int m) { lru.lru_set_max(m); }
@@ -459,6 +468,10 @@ protected:
void finish_cap_snap(Inode *in, CapSnap *capsnap, int used);
void _flushed_cap_snap(Inode *in, snapid_t seq);
+ void _schedule_invalidate_dentry_callback(Dentry *dn);
+ void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
+ void _invalidate_inode_parents(Inode *in);
+
void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
void _invalidate_inode_cache(Inode *in, bool keep_caps);
void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps);
@@ -735,6 +748,8 @@ public:
void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle);
+ void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle);
+
void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle);
};
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 6bf5ea3d34f..88f727e454e 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -12,7 +12,7 @@
*
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include <fuse/fuse.h>
#include <fuse/fuse_lowlevel.h>
@@ -551,7 +551,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
}
#endif
-static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
+static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
{
#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
@@ -560,6 +560,19 @@ static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t le
#endif
}
+static void dentry_invalidate_cb(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name)
+{
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
+ fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid);
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
+ fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid);
+ fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length());
+#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
+ fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length());
+#endif
+}
+
static void do_init(void *data, fuse_conn_info *bar)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)data;
@@ -743,9 +756,10 @@ int CephFuse::Handle::init(int argc, const char *argv[])
client->ll_register_getgroups_cb(getgroups_cb, this);
*/
+ client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this);
if (client->cct->_conf->fuse_use_invalidate_cb)
- client->ll_register_ino_invalidate_cb(invalidate_cb, this);
+ client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this);
done:
fuse_opt_free_args(&args);
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
index f602b80149e..a1c20bf0c12 100644
--- a/src/common/bloom_filter.cc
+++ b/src/common/bloom_filter.cc
@@ -6,26 +6,26 @@
void bloom_filter::encode(bufferlist& bl) const
{
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 2, bl);
::encode((uint64_t)salt_count_, bl);
- ::encode((uint64_t)table_size_, bl);
- ::encode((uint64_t)inserted_element_count_, bl);
+ ::encode((uint64_t)insert_count_, bl);
+ ::encode((uint64_t)target_element_count_, bl);
::encode((uint64_t)random_seed_, bl);
- bufferptr bp((const char*)bit_table_, raw_table_size_);
+ bufferptr bp((const char*)bit_table_, table_size_);
::encode(bp, bl);
ENCODE_FINISH(bl);
}
void bloom_filter::decode(bufferlist::iterator& p)
{
- DECODE_START(1, p);
+ DECODE_START(2, p);
uint64_t v;
::decode(v, p);
salt_count_ = v;
::decode(v, p);
- table_size_ = v;
+ insert_count_ = v;
::decode(v, p);
- inserted_element_count_ = v;
+ target_element_count_ = v;
::decode(v, p);
random_seed_ = v;
bufferlist t;
@@ -33,11 +33,14 @@ void bloom_filter::decode(bufferlist::iterator& p)
salt_.clear();
generate_unique_salt();
- raw_table_size_ = t.length();
- assert(raw_table_size_ == table_size_ / bits_per_char);
+ table_size_ = t.length();
delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- t.copy(0, raw_table_size_, (char *)bit_table_);
+ if (table_size_) {
+ bit_table_ = new cell_type[table_size_];
+ t.copy(0, table_size_, (char *)bit_table_);
+ } else {
+ bit_table_ = NULL;
+ }
DECODE_FINISH(p);
}
@@ -46,8 +49,8 @@ void bloom_filter::dump(Formatter *f) const
{
f->dump_unsigned("salt_count", salt_count_);
f->dump_unsigned("table_size", table_size_);
- f->dump_unsigned("raw_table_size", raw_table_size_);
- f->dump_unsigned("insert_count", inserted_element_count_);
+ f->dump_unsigned("insert_count", insert_count_);
+ f->dump_unsigned("target_element_count", target_element_count_);
f->dump_unsigned("random_seed", random_seed_);
f->open_array_section("salt_table");
@@ -56,21 +59,79 @@ void bloom_filter::dump(Formatter *f) const
f->close_section();
f->open_array_section("bit_table");
- for (unsigned i = 0; i < raw_table_size_; ++i)
+ for (unsigned i = 0; i < table_size_; ++i)
f->dump_unsigned("byte", (unsigned)bit_table_[i]);
f->close_section();
}
void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
{
- ls.push_back(new bloom_filter(10, .5, 1));
- ls.push_back(new bloom_filter(10, .5, 1));
+ ls.push_back(new bloom_filter(10, .5, 1, 5));
+ ls.push_back(new bloom_filter(10, .5, 1, 5));
ls.back()->insert("foo");
ls.back()->insert("bar");
- ls.push_back(new bloom_filter(50, .5, 1));
+ ls.push_back(new bloom_filter(50, .5, 1, 5));
ls.back()->insert("foo");
ls.back()->insert("bar");
ls.back()->insert("baz");
ls.back()->insert("boof");
ls.back()->insert("boogggg");
}
+
+
+void compressible_bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ bloom_filter::encode(bl);
+
+ uint32_t s = size_list.size();
+ ::encode(s, bl);
+ for (vector<size_t>::const_iterator p = size_list.begin();
+ p != size_list.end(); ++p)
+ ::encode((uint64_t)*p, bl);
+
+ ENCODE_FINISH(bl);
+}
+
+void compressible_bloom_filter::decode(bufferlist::iterator& p)
+{
+ DECODE_START(2, p);
+ bloom_filter::decode(p);
+
+ uint32_t s;
+ ::decode(s, p);
+ size_list.resize(s);
+ for (unsigned i = 0; i < s; i++) {
+ uint64_t v;
+ ::decode(v, p);
+ size_list[i] = v;
+ }
+
+ DECODE_FINISH(p);
+}
+
+void compressible_bloom_filter::dump(Formatter *f) const
+{
+ bloom_filter::dump(f);
+
+ f->open_array_section("table_sizes");
+ for (vector<size_t>::const_iterator p = size_list.begin();
+ p != size_list.end(); ++p)
+ f->dump_unsigned("size", (uint64_t)*p);
+ f->close_section();
+}
+
+void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls)
+{
+ ls.push_back(new compressible_bloom_filter(10, .5, 1, 4));
+ ls.push_back(new compressible_bloom_filter(10, .5, 1, 4));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.push_back(new compressible_bloom_filter(50, .5, 1, 4));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.back()->insert("baz");
+ ls.back()->insert("boof");
+ ls.back()->compress(20);
+ ls.back()->insert("boogggg");
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
index 6216c7fb34d..93787a89a60 100644
--- a/src/common/bloom_filter.hpp
+++ b/src/common/bloom_filter.hpp
@@ -53,14 +53,22 @@ protected:
typedef unsigned int bloom_type;
typedef unsigned char cell_type;
+ unsigned char* bit_table_; ///< pointer to bit map
+ std::vector<bloom_type> salt_; ///< vector of salts
+ std::size_t salt_count_; ///< number of salts
+ std::size_t table_size_; ///< bit table size in bytes
+ std::size_t insert_count_; ///< insertion count
+ std::size_t target_element_count_; ///< target number of unique insertions
+ std::size_t random_seed_; ///< random seed
+
public:
bloom_filter()
: bit_table_(0),
salt_count_(0),
table_size_(0),
- raw_table_size_(0),
- inserted_element_count_(0),
+ insert_count_(0),
+ target_element_count_(0),
random_seed_(0)
{}
@@ -68,7 +76,8 @@ public:
const double& false_positive_probability,
const std::size_t& random_seed)
: bit_table_(0),
- inserted_element_count_(0),
+ insert_count_(0),
+ target_element_count_(predicted_inserted_element_count),
random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
{
find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
@@ -76,12 +85,15 @@ public:
init();
}
- bloom_filter(const std::size_t& salt_count, std::size_t table_size,
- const std::size_t& random_seed)
+ bloom_filter(const std::size_t& salt_count,
+ std::size_t table_size,
+ const std::size_t& random_seed,
+ std::size_t target_element_count)
: bit_table_(0),
salt_count_(salt_count),
table_size_(table_size),
- inserted_element_count_(0),
+ insert_count_(0),
+ target_element_count_(target_element_count),
random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
{
init();
@@ -89,9 +101,12 @@ public:
void init() {
generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- bit_table_ = new cell_type[raw_table_size_];
- std::fill_n(bit_table_,raw_table_size_,0x00);
+ if (table_size_) {
+ bit_table_ = new cell_type[table_size_];
+ std::fill_n(bit_table_, table_size_, 0x00);
+ } else {
+ bit_table_ = NULL;
+ }
}
bloom_filter(const bloom_filter& filter)
@@ -104,12 +119,11 @@ public:
if (this != &filter) {
salt_count_ = filter.salt_count_;
table_size_ = filter.table_size_;
- raw_table_size_ = filter.raw_table_size_;
- inserted_element_count_ = filter.inserted_element_count_;
+ insert_count_ = filter.insert_count_;
random_seed_ = filter.random_seed_;
delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
+ bit_table_ = new cell_type[table_size_];
+ std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_);
salt_ = filter.salt_;
}
return *this;
@@ -127,8 +141,9 @@ public:
inline void clear()
{
- std::fill_n(bit_table_,raw_table_size_,0x00);
- inserted_element_count_ = 0;
+ if (bit_table_)
+ std::fill_n(bit_table_, table_size_, 0x00);
+ insert_count_ = 0;
}
/**
@@ -141,26 +156,28 @@ public:
* @param val integer value to insert
*/
inline void insert(uint32_t val) {
+ assert(bit_table_);
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ bit_table_[bit_index >> 3] |= bit_mask[bit];
}
- ++inserted_element_count_;
+ ++insert_count_;
}
inline void insert(const unsigned char* key_begin, const std::size_t& length)
{
+ assert(bit_table_);
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ bit_table_[bit_index >> 3] |= bit_mask[bit];
}
- ++inserted_element_count_;
+ ++insert_count_;
}
template<typename T>
@@ -202,12 +219,14 @@ public:
*/
inline virtual bool contains(uint32_t val) const
{
+ if (!bit_table_)
+ return false;
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
{
return false;
}
@@ -217,12 +236,14 @@ public:
inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
{
+ if (!bit_table_)
+ return false;
std::size_t bit_index = 0;
std::size_t bit = 0;
for (std::size_t i = 0; i < salt_.size(); ++i)
{
compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
{
return false;
}
@@ -278,12 +299,41 @@ public:
inline virtual std::size_t size() const
{
- return table_size_;
+ return table_size_ * bits_per_char;
}
inline std::size_t element_count() const
{
- return inserted_element_count_;
+ return insert_count_;
+ }
+
+ /*
+ * density of bits set. inconvenient units, but:
+ * .3 = ~50% target insertions
+ * .5 = 100% target insertions, "perfectly full"
+ * .75 = 200% target insertions
+ * 1.0 = all bits set... infinite insertions
+ */
+ inline double density() const
+ {
+ if (!bit_table_)
+ return 0.0;
+ size_t set = 0;
+ uint8_t *p = bit_table_;
+ size_t left = table_size_;
+ while (left-- > 0) {
+ uint8_t c = *p;
+ for (; c; ++set)
+ c &= c - 1;
+ ++p;
+ }
+ return (double)set / (double)(table_size_ << 3);
+ }
+
+ virtual inline double approx_unique_element_count() const {
+ // this is not a very good estimate; a better solution should have
+ // some asymptotic behavior as density() approaches 1.0.
+ return (double)target_element_count_ * 2.0 * density();
}
inline double effective_fpp() const
@@ -295,7 +345,7 @@ public:
the current number of inserted elements - not the user defined
predicated/expected number of inserted elements.
*/
- return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
+ return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
}
inline bloom_filter& operator &= (const bloom_filter& filter)
@@ -306,7 +356,7 @@ public:
(table_size_ == filter.table_size_) &&
(random_seed_ == filter.random_seed_)
) {
- for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ for (std::size_t i = 0; i < table_size_; ++i) {
bit_table_[i] &= filter.bit_table_[i];
}
}
@@ -321,7 +371,7 @@ public:
(table_size_ == filter.table_size_) &&
(random_seed_ == filter.random_seed_)
) {
- for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ for (std::size_t i = 0; i < table_size_; ++i) {
bit_table_[i] |= filter.bit_table_[i];
}
}
@@ -336,7 +386,7 @@ public:
(table_size_ == filter.table_size_) &&
(random_seed_ == filter.random_seed_)
) {
- for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ for (std::size_t i = 0; i < table_size_; ++i) {
bit_table_[i] ^= filter.bit_table_[i];
}
}
@@ -352,8 +402,8 @@ protected:
inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
{
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
+ bit_index = hash % (table_size_ << 3);
+ bit = bit_index & 7;
}
void generate_unique_salt()
@@ -418,7 +468,8 @@ protected:
}
else
{
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
+ std::copy(predef_salt,predef_salt + predef_salt_count,
+ std::back_inserter(salt_));
srand(static_cast<unsigned int>(random_seed_));
while (salt_.size() < salt_count_)
{
@@ -466,8 +517,8 @@ protected:
*salt_count = static_cast<std::size_t>(min_k);
size_t t = static_cast<std::size_t>(min_m);
- t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0);
- *table_size = t;
+ t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0);
+ *table_size = t >> 3;
}
inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
@@ -507,14 +558,6 @@ protected:
return hash;
}
- std::vector<bloom_type> salt_;
- unsigned char* bit_table_;
- std::size_t salt_count_;
- std::size_t table_size_;
- std::size_t raw_table_size_;
- std::size_t inserted_element_count_;
- std::size_t random_seed_;
-
public:
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
@@ -549,53 +592,77 @@ class compressible_bloom_filter : public bloom_filter
{
public:
+ compressible_bloom_filter() : bloom_filter() {}
+
compressible_bloom_filter(const std::size_t& predicted_element_count,
const double& false_positive_probability,
const std::size_t& random_seed)
- : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
+ : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
+ {
+ size_list.push_back(table_size_);
+ }
+
+ compressible_bloom_filter(const std::size_t& salt_count,
+ std::size_t table_size,
+ const std::size_t& random_seed,
+ std::size_t target_count)
+ : bloom_filter(salt_count, table_size, random_seed, target_count)
{
size_list.push_back(table_size_);
}
inline virtual std::size_t size() const
{
- return size_list.back();
+ return size_list.back() * bits_per_char;
}
- inline bool compress(const double& percentage)
+ inline bool compress(const double& target_ratio)
{
- if ((0.0 >= percentage) || (percentage >= 100.0))
+ if (!bit_table_)
+ return false;
+
+ if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
{
return false;
}
std::size_t original_table_size = size_list.back();
- std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
- new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
+ std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
- if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
+ if ((!new_table_size) || (new_table_size >= original_table_size))
{
return false;
}
- cell_type* tmp = new cell_type[new_table_size / bits_per_char];
- std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
- cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
- cell_type* end = bit_table_ + (original_table_size / bits_per_char);
+ cell_type* tmp = new cell_type[new_table_size];
+ std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
+ cell_type* itr = bit_table_ + (new_table_size);
+ cell_type* end = bit_table_ + (original_table_size);
cell_type* itr_tmp = tmp;
-
+ cell_type* itr_end = tmp + (new_table_size);
while (end != itr)
{
*(itr_tmp++) |= (*itr++);
+ if (itr_tmp == itr_end)
+ itr_tmp = tmp;
}
delete[] bit_table_;
bit_table_ = tmp;
size_list.push_back(new_table_size);
+ table_size_ = new_table_size;
return true;
}
+ virtual inline double approx_unique_element_count() const {
+ // this is not a very good estimate; a better solution should have
+ // some asymptotic behavior as density() approaches 1.0.
+ //
+ // the compress() correction is also bad; it tends to under-estimate.
+ return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
+ }
+
private:
inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
@@ -603,13 +670,19 @@ private:
bit_index = hash;
for (std::size_t i = 0; i < size_list.size(); ++i)
{
- bit_index %= size_list[i];
+ bit_index %= size_list[i] << 3;
}
- bit = bit_index % bits_per_char;
+ bit = bit_index & 7;
}
std::vector<std::size_t> size_list;
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
};
+WRITE_CLASS_ENCODER(compressible_bloom_filter)
#endif
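
The compress() rework above replaces the percentage-based shrink with a target ratio and fixes the folding loop so the tail of the bit table wraps back onto the front instead of running off the end. Stripped of Ceph's cell_type table and bufferlist plumbing, the folding step looks roughly like this sketch (a simplified model with illustrative names, not the class's actual API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Fold a byte table down to new_size bytes by OR'ing every byte past the
    // cut back into the front, wrapping modulo new_size. Set bits survive,
    // so membership queries stay correct at a higher false-positive rate.
    std::vector<unsigned char>
    fold_table(const std::vector<unsigned char>& table, std::size_t new_size)
    {
      assert(new_size > 0 && new_size < table.size());
      std::vector<unsigned char> out(table.begin(), table.begin() + new_size);
      std::size_t j = 0;
      for (std::size_t i = new_size; i < table.size(); ++i) {
        out[j] |= table[i];
        if (++j == new_size)   // the wrap that the new itr_tmp/itr_end check adds
          j = 0;
      }
      return out;
    }

A query must then reduce its hash modulo every historical size in turn, which is why size_list keeps all previous sizes and compute_indices() folds the bit index through each of them.
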
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index fad831f5543..2d3f981379b 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -544,12 +544,19 @@ OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0)
OPTION(filestore_debug_inject_read_err, OPT_BOOL, false)
OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync
+
// Use omap for xattrs for attrs over
-OPTION(filestore_xattr_use_omap, OPT_BOOL, false)
// filestore_max_inline_xattr_size or
-OPTION(filestore_max_inline_xattr_size, OPT_U32, 512)
+OPTION(filestore_max_inline_xattr_size, OPT_U32, 0)	// Override; 0 = use per-fs default
+OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536)
+OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048)
+OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512)
+
// for more than filestore_max_inline_xattrs attrs
-OPTION(filestore_max_inline_xattrs, OPT_U32, 2)
+OPTION(filestore_max_inline_xattrs, OPT_U32, 0)	// Override; 0 = use per-fs default
+OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2)
OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs
OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
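
The new per-filesystem options follow an override convention: the unsuffixed option defaults to 0, meaning "use the per-fs value", and any non-zero setting wins unconditionally (this is the logic set_xattr_limits_via_conf() applies further down). As a standalone sketch, with a hypothetical helper name:

    #include <cstdint>

    // 0 means "no override": fall back to the per-filesystem default.
    inline uint32_t resolve_xattr_limit(uint32_t override_value,
                                        uint32_t fs_default)
    {
      return override_value ? override_value : fs_default;
    }

    // e.g. resolve_xattr_limit(0     /* filestore_max_inline_xattr_size */,
    //                          65536 /* ..._xattr_size_xfs */) == 65536
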
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 82eecf3bfc7..a769ad060d9 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -247,6 +247,10 @@ public:
return hobj.get_filestore_key();
}
+ bool is_degenerate() const {
+ return generation == NO_GEN && shard_id == NO_SHARD;
+ }
+
// maximum sorted value.
static ghobject_t get_max() {
ghobject_t h(hobject_t::get_max());
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 2c64a8f2ef2..d8c90bc3d76 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2561,67 +2561,98 @@ bool Monitor::_ms_dispatch(Message *m)
EntityName entity_name;
bool src_is_mon;
- src_is_mon = !connection || (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
-
- if (connection) {
- bool reuse_caps = false;
- dout(20) << "have connection" << dendl;
- s = static_cast<MonSession *>(connection->get_priv());
- if (s && s->closed) {
- caps = s->caps;
- reuse_caps = true;
- s->put();
- s = NULL;
+ // regardless of who we are or who the sender is, the message must
+  // have a connection associated.  If it doesn't, then something fishy
+ // is going on.
+ assert(connection);
+
+ src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+
+ bool reuse_caps = false;
+ dout(20) << "have connection" << dendl;
+ s = static_cast<MonSession *>(connection->get_priv());
+ if (s && s->closed) {
+ caps = s->caps;
+ reuse_caps = true;
+ s->put();
+ s = NULL;
+ }
+ if (!s) {
+ // if the sender is not a monitor, make sure their first message for a
+    // session is an MAuth.  If it is not, treat it as a stray message: we
+    // are creating a new session, so the sender cannot have authenticated
+    // yet, and we have no way of assessing whether we should handle it.
+ if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+ m->get_type() != CEPH_MSG_MON_GET_MAP)) {
+ dout(1) << __func__ << " dropping stray message " << *m
+ << " from " << m->get_source_inst() << dendl;
+ return false;
}
- if (!s) {
- if (!exited_quorum.is_zero() && !src_is_mon) {
- waitlist_or_zap_client(m);
- return true;
- }
- dout(10) << "do not have session, making new one" << dendl;
- s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
- m->get_connection()->set_priv(s->get());
- dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
-
- if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
- dout(10) << "setting timeout on session" << dendl;
- // set an initial timeout here, so we will trim this session even if they don't
- // do anything.
- s->until = ceph_clock_now(g_ceph_context);
- s->until += g_conf->mon_subscribe_interval;
- } else {
- //give it monitor caps; the peer type has been authenticated
- reuse_caps = false;
- dout(5) << "setting monitor caps on this connection" << dendl;
- if (!s->caps.is_allow_all()) //but no need to repeatedly copy
- s->caps = *mon_caps;
- }
- if (reuse_caps)
- s->caps = caps;
+
+ if (!exited_quorum.is_zero() && !src_is_mon) {
+ waitlist_or_zap_client(m);
+ return true;
+ }
+
+ dout(10) << "do not have session, making new one" << dendl;
+ s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+ m->get_connection()->set_priv(s->get());
+ dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
+
+ if (!src_is_mon) {
+ dout(10) << "setting timeout on session" << dendl;
+ // set an initial timeout here, so we will trim this session even if they don't
+ // do anything.
+ s->until = ceph_clock_now(g_ceph_context);
+ s->until += g_conf->mon_subscribe_interval;
} else {
- dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ //give it monitor caps; the peer type has been authenticated
+ reuse_caps = false;
+ dout(5) << "setting monitor caps on this connection" << dendl;
+ if (!s->caps.is_allow_all()) //but no need to repeatedly copy
+ s->caps = *mon_caps;
}
+ if (reuse_caps)
+ s->caps = caps;
+ } else {
+ dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ }
+
+ if (s) {
if (s->auth_handler) {
entity_name = s->auth_handler->get_entity_name();
}
- }
-
- if (s)
dout(20) << " caps " << s->caps.get_str() << dendl;
+ }
if (is_synchronizing() && !src_is_mon) {
waitlist_or_zap_client(m);
return true;
}
- {
- switch (m->get_type()) {
-
+ ret = dispatch(s, m, src_is_mon);
+
+ if (s) {
+ s->put();
+ }
+
+ return ret;
+}
+
+bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
+{
+ bool ret = true;
+
+ assert(m != NULL);
+
+ switch (m->get_type()) {
+
case MSG_ROUTE:
handle_route(static_cast<MRoute*>(m));
break;
- // misc
+ // misc
case CEPH_MSG_MON_GET_MAP:
handle_mon_get_map(static_cast<MMonGetMap*>(m));
break;
@@ -2647,12 +2678,11 @@ bool Monitor::_ms_dispatch(Message *m)
case MSG_MON_SYNC:
handle_sync(static_cast<MMonSync*>(m));
break;
-
case MSG_MON_SCRUB:
handle_scrub(static_cast<MMonScrub*>(m));
break;
- // OSDs
+ // OSDs
case MSG_OSD_MARK_ME_DOWN:
case MSG_OSD_FAILURE:
case MSG_OSD_BOOT:
@@ -2665,20 +2695,20 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // MDSs
+ // MDSs
case MSG_MDS_BEACON:
case MSG_MDS_OFFLOAD_TARGETS:
paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // auth
+ // auth
case MSG_MON_GLOBAL_ID:
case CEPH_MSG_AUTH:
/* no need to check caps here */
paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m);
break;
- // pg
+ // pg
case CEPH_MSG_STATFS:
case MSG_PGSTATS:
case MSG_GETPOOLSTATS:
@@ -2689,7 +2719,7 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // log
+ // log
case MSG_LOG:
paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m);
break;
@@ -2698,60 +2728,60 @@ bool Monitor::_ms_dispatch(Message *m)
clog.handle_log_ack((MLogAck*)m);
break;
- // monmap
+ // monmap
case MSG_MON_JOIN:
paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // paxos
+ // paxos
case MSG_MON_PAXOS:
{
- MMonPaxos *pm = static_cast<MMonPaxos*>(m);
- if (!src_is_mon &&
- !s->is_capable("mon", MON_CAP_X)) {
- //can't send these!
- pm->put();
- break;
- }
+ MMonPaxos *pm = static_cast<MMonPaxos*>(m);
+ if (!src_is_mon ||
+ !s->is_capable("mon", MON_CAP_X)) {
+ //can't send these!
+ pm->put();
+ break;
+ }
- if (state == STATE_SYNCHRONIZING) {
- // we are synchronizing. These messages would do us no
- // good, thus just drop them and ignore them.
- dout(10) << __func__ << " ignore paxos msg from "
- << pm->get_source_inst() << dendl;
- pm->put();
- break;
- }
+ if (state == STATE_SYNCHRONIZING) {
+ // we are synchronizing. These messages would do us no
+ // good, thus just drop them and ignore them.
+ dout(10) << __func__ << " ignore paxos msg from "
+ << pm->get_source_inst() << dendl;
+ pm->put();
+ break;
+ }
- // sanitize
- if (pm->epoch > get_epoch()) {
- bootstrap();
- pm->put();
- break;
- }
- if (pm->epoch != get_epoch()) {
- pm->put();
- break;
- }
+ // sanitize
+ if (pm->epoch > get_epoch()) {
+ bootstrap();
+ pm->put();
+ break;
+ }
+ if (pm->epoch != get_epoch()) {
+ pm->put();
+ break;
+ }
- paxos->dispatch((PaxosServiceMessage*)m);
+ paxos->dispatch((PaxosServiceMessage*)m);
}
break;
- // elector messages
+ // elector messages
case MSG_MON_ELECTION:
//check privileges here for simplicity
if (s &&
- !s->is_capable("mon", MON_CAP_X)) {
- dout(0) << "MMonElection received from entity without enough caps!"
- << s->caps << dendl;
- m->put();
- break;
+ !s->is_capable("mon", MON_CAP_X)) {
+ dout(0) << "MMonElection received from entity without enough caps!"
+ << s->caps << dendl;
+ m->put();
+ break;
}
if (!is_probing() && !is_synchronizing()) {
- elector.dispatch(m);
+ elector.dispatch(m);
} else {
- m->put();
+ m->put();
}
break;
@@ -2769,10 +2799,6 @@ bool Monitor::_ms_dispatch(Message *m)
default:
ret = false;
- }
- }
- if (s) {
- s->put();
}
return ret;
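
The net effect of the Monitor change above is to split _ms_dispatch() into two layers: the entry point owns session lookup, creation, and the single reference drop, while the new dispatch() helper only routes by message type. A compilable toy model of that shape (stand-in types, not Ceph's):

    #include <cassert>

    struct MonSession { int refs = 1; void put() { --refs; } };
    struct Message { int type; };
    enum { MSG_AUTH = 1, MSG_OTHER = 2 };

    // Pure routing: no session refcounting, mirroring Monitor::dispatch().
    bool dispatch(MonSession*, Message* m, bool /*src_is_mon*/)
    {
      switch (m->type) {
      case MSG_AUTH:
      case MSG_OTHER:
        return true;   // handled
      default:
        return false;  // unknown type, as in the "default: ret = false" above
      }
    }

    // Entry point: owns the session reference and drops it exactly once.
    bool handle(MonSession* s, Message* m, bool src_is_mon)
    {
      bool ret = dispatch(s, m, src_is_mon);
      if (s)
        s->put();
      return ret;
    }

    int main()
    {
      MonSession s;
      Message m{MSG_AUTH};
      assert(handle(&s, &m, false));
      assert(s.refs == 0);  // reference dropped in one place only
    }
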
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 9b304428732..2c1c2cdeb19 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -700,6 +700,8 @@ public:
lock.Unlock();
return ret;
}
+ // dissociate message handling from session and connection logic
+ bool dispatch(MonSession *s, Message *m, const bool src_is_mon);
//mon_caps is used for un-connected messages from monitors
MonCap * mon_caps;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 799f19df154..ca855592445 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -298,20 +298,45 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
addr.set_port(CEPH_MON_PORT);
}
- if (pending_map.contains(addr) ||
- pending_map.contains(name)) {
+ /**
+ * If we have a monitor with the same name and different addr, then EEXIST
+ * If we have a monitor with the same addr and different name, then EEXIST
+ * If we have a monitor with the same addr and same name, then return as if
+ * we had just added the monitor.
+ * If we don't have the monitor, add it.
+ */
+
+ err = 0;
+ if (!ss.str().empty())
+ ss << "; ";
+
+ do {
+ if (pending_map.contains(addr)) {
+ string n = pending_map.get_name(addr);
+ if (n == name)
+ break;
+ } else if (pending_map.contains(name)) {
+ entity_addr_t tmp_addr = pending_map.get_addr(name);
+ if (tmp_addr == addr)
+ break;
+ } else {
+ break;
+ }
err = -EEXIST;
- if (!ss.str().empty())
- ss << "; ";
- ss << "mon " << name << " " << addr << " already exists";
+ ss << "mon." << name << " at " << addr << " already exists";
+ goto out;
+ } while (false);
+
+ ss << "added mon." << name << " at " << addr;
+ if (pending_map.contains(name)) {
goto out;
}
pending_map.add(name, addr);
pending_map.last_changed = ceph_clock_now(g_ceph_context);
- ss << "added mon." << name << " at " << addr;
getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+ get_last_committed()));
return true;
} else if (prefix == "mon remove") {
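
The "mon add" rework above makes the command idempotent. The decision table from the comment, restated as a self-contained function over a name -> addr map (illustrative types, not the MonMap API):

    #include <cerrno>
    #include <map>
    #include <string>

    // Returns 0 on success (including "already there with the same addr"),
    // -EEXIST when the name or the addr is held by a different monitor.
    int mon_add(std::map<std::string, std::string>& mons,  // name -> addr
                const std::string& name, const std::string& addr)
    {
      for (const auto& e : mons)
        if (e.second == addr)
          return e.first == name ? 0 : -EEXIST;  // addr held by another name
      if (mons.count(name))
        return -EEXIST;                          // name held by another addr
      mons.emplace(name, addr);
      return 0;
    }
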
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index e9a35c6b8ab..ea70bbd61c3 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
::encode(version, bl);
::encode(pg_stat_updates, bl);
::encode(osd_stat_updates, bl);
@@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
::encode(nearfull_ratio, bl);
::encode(pg_remove, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl)
}
if (struct_v >= 6)
::decode(stamp, bl);
+ if (struct_v >= 7) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin();
+ i != osd_stat_updates.end();
+ ++i) {
+      // This isn't accurate, but will cause trimming to behave as it
+      // did previously.
+ osd_epochs.insert(make_pair(i->first, osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
}
@@ -140,6 +152,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->version = 2;
o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
o.back()->osd_stat_updates[5] = osd_stat_t();
+ o.back()->osd_epochs[5] = 12;
o.push_back(new Incremental);
o.back()->version = 3;
o.back()->osdmap_epoch = 1;
@@ -148,6 +161,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->nearfull_ratio = .3;
o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
o.back()->osd_stat_updates[6] = osd_stat_t();
+ o.back()->osd_epochs[6] = 12;
o.back()->pg_remove.insert(pg_t(1,2,3));
o.back()->osd_stat_rm.insert(5);
}
@@ -195,8 +209,10 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
stat_pg_add(update_pg, update_stat);
}
- for (map<int32_t,osd_stat_t>::const_iterator p = inc.osd_stat_updates.begin();
- p != inc.osd_stat_updates.end();
+ assert(osd_stat.size() == osd_epochs.size());
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ inc.get_osd_stat_updates().begin();
+ p != inc.get_osd_stat_updates().end();
++p) {
int osd = p->first;
const osd_stat_t &new_stats(p->second);
@@ -209,6 +225,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
stat_osd_sub(t->second);
t->second = new_stats;
}
+ assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end());
+ osd_epochs.insert(*(inc.get_osd_epochs().find(osd)));
stat_osd_add(new_stats);
@@ -226,8 +244,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
}
- for (set<int>::iterator p = inc.osd_stat_rm.begin();
- p != inc.osd_stat_rm.end();
+ for (set<int>::iterator p = inc.get_osd_stat_rm().begin();
+ p != inc.get_osd_stat_rm().end();
++p) {
hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p);
if (t != osd_stat.end()) {
@@ -416,6 +434,14 @@ epoch_t PGMap::calc_min_last_epoch_clean() const
if (lec < min)
min = lec;
}
+ // also scan osd epochs
+ // don't trim past the oldest reported osd epoch
+ for (hash_map<int32_t, epoch_t>::const_iterator i = osd_epochs.begin();
+ i != osd_epochs.end();
+ ++i) {
+ if (i->second < min)
+ min = i->second;
+ }
return min;
}
@@ -434,7 +460,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(5, 4, bl);
+ ENCODE_START(6, 4, bl);
::encode(version, bl);
::encode(pg_stat, bl);
::encode(osd_stat, bl);
@@ -443,6 +469,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
::encode(full_ratio, bl);
::encode(nearfull_ratio, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -472,6 +499,17 @@ void PGMap::decode(bufferlist::iterator &bl)
}
if (struct_v >= 5)
::decode(stamp, bl);
+ if (struct_v >= 6) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (hash_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin();
+ i != osd_stat.end();
+ ++i) {
+      // This isn't accurate, but will cause trimming to behave as it
+      // did previously.
+ osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
calc_stats();
@@ -488,7 +526,10 @@ void PGMap::dirty_all(Incremental& inc)
inc.pg_stat_updates[p->first] = p->second;
}
for (hash_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- inc.osd_stat_updates[p->first] = p->second;
+ assert(inc.get_osd_epochs().count(p->first));
+ inc.update_stat(p->first,
+ inc.get_osd_epochs().find(p->first)->second,
+ p->second);
}
}
@@ -701,7 +742,8 @@ void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) c
{
hash_map<pg_t, pg_stat_t> stuck_pg_stats;
get_stuck_stats(type, cutoff, stuck_pg_stats);
- dump_pg_stats_plain(ss, stuck_pg_stats);
+ if (!stuck_pg_stats.empty())
+ dump_pg_stats_plain(ss, stuck_pg_stats);
}
void PGMap::dump_osd_perf_stats(Formatter *f) const
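
osd_epochs exists so that osdmap trimming cannot advance past the oldest epoch any OSD has actually reported; calc_min_last_epoch_clean() above now takes the minimum across both the per-PG and per-OSD reports. A simplified model of the bound (stand-in types, not PGMap's hash_map members):

    #include <algorithm>
    #include <cstdint>
    #include <map>

    using epoch_t = uint32_t;

    epoch_t min_trim_epoch(const std::map<uint64_t, epoch_t>& pg_last_epoch_clean,
                           const std::map<int32_t, epoch_t>& osd_epochs)
    {
      epoch_t min = UINT32_MAX;
      for (const auto& p : pg_last_epoch_clean)
        min = std::min(min, p.second);
      for (const auto& o : osd_epochs)  // never trim past an OSD's last report
        min = std::min(min, o.second);
      return min;
    }
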
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 84d89f87517..7a202fc0006 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -43,12 +43,13 @@ public:
float full_ratio;
float nearfull_ratio;
+ // mapping of osd to most recently reported osdmap epoch
+ hash_map<int32_t,epoch_t> osd_epochs;
+
class Incremental {
public:
version_t version;
map<pg_t,pg_stat_t> pg_stat_updates;
- map<int32_t,osd_stat_t> osd_stat_updates;
- set<int32_t> osd_stat_rm;
epoch_t osdmap_epoch;
epoch_t pg_scan; // osdmap epoch
set<pg_t> pg_remove;
@@ -56,6 +57,38 @@ public:
float nearfull_ratio;
utime_t stamp;
+ private:
+ map<int32_t,osd_stat_t> osd_stat_updates;
+ set<int32_t> osd_stat_rm;
+
+ // mapping of osd to most recently reported osdmap epoch
+ map<int32_t,epoch_t> osd_epochs;
+ public:
+
+ const map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
+ return osd_stat_updates;
+ }
+ const set<int32_t> &get_osd_stat_rm() const {
+ return osd_stat_rm;
+ }
+ const map<int32_t, epoch_t> &get_osd_epochs() const {
+ return osd_epochs;
+ }
+
+ void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) {
+ osd_stat_updates[osd] = stat;
+ osd_epochs[osd] = epoch;
+ assert(osd_epochs.size() == osd_stat_updates.size());
+ }
+ void stat_osd_out(int32_t osd) {
+      // zero the stats for the osd
+ osd_stat_updates[osd] = osd_stat_t();
+ }
+ void rm_stat(int32_t osd) {
+ osd_stat_rm.insert(osd);
+ osd_epochs.erase(osd);
+ osd_stat_updates.erase(osd);
+ }
void encode(bufferlist &bl, uint64_t features=-1) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
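
Making osd_stat_updates and osd_stat_rm private and funnelling all writes through update_stat()/rm_stat() is what keeps osd_epochs in lockstep with the stats map. The invariant, as a minimal model with illustrative names:

    #include <cassert>
    #include <map>
    #include <set>

    struct IncModel {
      void update_stat(int osd, unsigned epoch, int stat) {
        stats[osd]  = stat;
        epochs[osd] = epoch;
        assert(stats.size() == epochs.size());  // the assert update_stat() keeps
      }
      void rm_stat(int osd) {
        rm.insert(osd);
        stats.erase(osd);
        epochs.erase(osd);
      }
    private:
      std::map<int, int>      stats;   // stands in for osd_stat_updates
      std::map<int, unsigned> epochs;  // stands in for osd_epochs
      std::set<int>           rm;      // stands in for osd_stat_rm
    };
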
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 0f495052747..0644922ddb4 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -494,15 +494,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t)
{
bufferlist dirty;
string prefix = pgmap_osd_prefix;
- for (map<int32_t,osd_stat_t>::const_iterator p = pending_inc.osd_stat_updates.begin();
- p != pending_inc.osd_stat_updates.end();
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ pending_inc.get_osd_stat_updates().begin();
+ p != pending_inc.get_osd_stat_updates().end();
++p) {
::encode(p->first, dirty);
bufferlist bl;
::encode(p->second, bl, features);
t->put(prefix, stringify(p->first), bl);
}
- for (set<int32_t>::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) {
+ for (set<int32_t>::const_iterator p =
+ pending_inc.get_osd_stat_rm().begin();
+ p != pending_inc.get_osd_stat_rm().end();
+ ++p) {
::encode(*p, dirty);
t->erase(prefix, stringify(*p));
}
@@ -725,7 +729,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
}
// osd stat
- pending_inc.osd_stat_updates[from] = stats->osd_stat;
+ if (mon->osdmon()->osdmap.is_in(from)) {
+ pending_inc.update_stat(from, stats->epoch, stats->osd_stat);
+ } else {
+ pending_inc.update_stat(from, stats->epoch, osd_stat_t());
+ }
if (pg_map.osd_stat.count(from))
dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl;
@@ -842,11 +850,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
++p)
if (p->second == CEPH_OSD_OUT) {
dout(10) << "check_osd_map osd." << p->first << " went OUT" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
- } else {
- dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl;
- pending_inc.osd_stat_rm.erase(p->first);
- pending_inc.osd_stat_updates[p->first];
+ pending_inc.stat_osd_out(p->first);
}
// this is conservative: we want to know if any osds (maybe) got marked down.
@@ -867,7 +871,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
// whether it was created *or* destroyed, we can safely drop
       // its osd_stat_t record.
dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
+ pending_inc.rm_stat(p->first);
// and adjust full, nearfull set
pg_map.nearfull_osds.erase(p->first);
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index cd8a8e50658..514ff022bee 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -422,7 +422,10 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
m_filestore_do_dump(false),
m_filestore_dump_fmt(true),
m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
- m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size)
+ m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size),
+ m_fs_type(FS_TYPE_NONE),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0)
{
m_filestore_kill_at.set(g_conf->filestore_kill_at);
@@ -825,12 +828,14 @@ int FileStore::_detect_fs()
blk_size = st.f_bsize;
+ m_fs_type = FS_TYPE_OTHER;
#if defined(__linux__)
if (st.f_type == BTRFS_SUPER_MAGIC) {
dout(0) << "mount detected btrfs" << dendl;
backend = new BtrfsFileStoreBackend(this);
wbthrottle.set_fs(WBThrottle::BTRFS);
+ m_fs_type = FS_TYPE_BTRFS;
} else if (st.f_type == XFS_SUPER_MAGIC) {
dout(1) << "mount detected xfs" << dendl;
if (m_filestore_replica_fadvise) {
@@ -838,15 +843,19 @@ int FileStore::_detect_fs()
g_conf->set_val("filestore_replica_fadvise", "false");
g_conf->apply_changes(NULL);
assert(m_filestore_replica_fadvise == false);
      }
+      m_fs_type = FS_TYPE_XFS;
}
#endif
#ifdef HAVE_LIBZFS
if (st.f_type == ZFS_SUPER_MAGIC) {
backend = new ZFSFileStoreBackend(this);
+ m_fs_type = FS_TYPE_ZFS;
}
#endif
+ set_xattr_limits_via_conf();
+
r = backend->detect_features();
if (r < 0) {
derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
@@ -887,14 +896,7 @@ int FileStore::_detect_fs()
chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
if (ret == -ENOSPC) {
- if (!g_conf->filestore_xattr_use_omap) {
- dout(0) << "limited size xattrs -- automatically enabling filestore_xattr_use_omap" << dendl;
- g_conf->set_val("filestore_xattr_use_omap", "true");
- g_conf->apply_changes(NULL);
- assert(g_conf->filestore_xattr_use_omap == true);
- } else {
- dout(0) << "limited size xattrs -- filestore_xattr_use_omap already enabled" << dendl;
- }
+ dout(0) << "limited size xattrs" << dendl;
}
chain_fremovexattr(tmpfd, "user.test");
chain_fremovexattr(tmpfd, "user.test2");
@@ -3397,7 +3399,7 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = _fgetattr(**fd, n, bp);
lfn_close(fd);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
map<string, bufferlist> got;
set<string> to_get;
to_get.insert(string(name));
@@ -3433,6 +3435,9 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff
int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only)
{
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
dout(15) << "getattrs " << cid << "/" << oid << dendl;
FDRef fd;
int r = lfn_open(cid, oid, false, &fd);
@@ -3440,43 +3445,41 @@ int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>
goto out;
}
r = _fgetattrs(**fd, aset, user_only);
+ if (r < 0) {
+ goto out;
+ }
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- map<string, bufferlist> omap_aset;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- assert(omap_attrs.size() == omap_aset.size());
- for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
i != omap_aset.end();
++i) {
- string key;
- if (user_only) {
+ string key;
+ if (user_only) {
if (i->first[0] != '_')
continue;
if (i->first == "_")
continue;
key = i->first.substr(1, i->first.size());
- } else {
+ } else {
key = i->first;
- }
- aset.insert(make_pair(key,
- bufferptr(i->second.c_str(), i->second.length())));
}
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
}
out:
dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl;
@@ -3502,10 +3505,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
if (r < 0) {
goto out;
}
- if (g_conf->filestore_xattr_use_omap) {
- r = _fgetattrs(**fd, inline_set, false);
- assert(!m_filestore_fail_eio || r != -EIO);
- }
+ r = _fgetattrs(**fd, inline_set, false);
+ assert(!m_filestore_fail_eio || r != -EIO);
dout(15) << "setattrs " << cid << "/" << oid << dendl;
r = 0;
for (map<string,bufferptr>::iterator p = aset.begin();
@@ -3513,8 +3514,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
++p) {
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
- if (g_conf->filestore_xattr_use_omap) {
- if (p->second.length() > g_conf->filestore_max_inline_xattr_size) {
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3523,10 +3524,10 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
}
omap_set[p->first].push_back(p->second);
continue;
- }
+ }
- if (!inline_set.count(p->first) &&
- inline_set.size() >= g_conf->filestore_max_inline_xattrs) {
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3535,10 +3536,9 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
}
omap_set[p->first].push_back(p->second);
continue;
- }
- omap_remove.insert(p->first);
- inline_set.insert(*p);
}
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
inline_to_set.insert(*p);
@@ -3549,17 +3549,17 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
goto out_close;
if (!omap_remove.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->remove_xattrs(oid, omap_remove, &spos);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl;
assert(!m_filestore_fail_eio || r != -EIO);
goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
}
}
if (!omap_set.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->set_xattrs(oid, omap_set, &spos);
if (r < 0) {
dout(10) << __func__ << " could not set_xattrs r = " << r << dendl;
@@ -3587,7 +3587,7 @@ int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = chain_fremovexattr(**fd, n);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
Index index;
r = get_index(cid, &index);
if (r < 0) {
@@ -3617,6 +3617,8 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
map<string,bufferptr> aset;
FDRef fd;
+ set<string> omap_attrs;
+ Index index;
int r = lfn_open(cid, oid, false, &fd);
if (r < 0) {
goto out;
@@ -3633,25 +3635,21 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
}
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- Index index;
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- return r;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- r = object_map->remove_xattrs(oid, omap_attrs, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
- return r;
- }
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ return r;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
+ return r;
}
out:
dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
@@ -4560,6 +4558,17 @@ const char** FileStore::get_tracked_conf_keys() const
void FileStore::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other")) {
+ Mutex::Locker l(lock);
+ set_xattr_limits_via_conf();
+ }
if (changed.count("filestore_min_sync_interval") ||
changed.count("filestore_max_sync_interval") ||
changed.count("filestore_queue_max_ops") ||
@@ -4639,6 +4648,44 @@ void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t
m_filestore_dump.flush();
}
+void FileStore::set_xattr_limits_via_conf()
+{
+ uint32_t fs_xattr_size;
+ uint32_t fs_xattrs;
+
+ assert(m_fs_type != FS_TYPE_NONE);
+
+ switch(m_fs_type) {
+ case FS_TYPE_XFS:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
+ break;
+ case FS_TYPE_BTRFS:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
+ break;
+ case FS_TYPE_ZFS:
+ case FS_TYPE_OTHER:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
+ break;
+ default:
+ assert(!"Unknown fs type");
+ }
+
+  // Use override value if set
+ if (g_conf->filestore_max_inline_xattr_size)
+ m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size;
+ else
+ m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+  // Use override value if set
+ if (g_conf->filestore_max_inline_xattrs)
+ m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
+ else
+ m_filestore_max_inline_xattrs = fs_xattrs;
+}
+
// -- FSSuperblock --
void FSSuperblock::encode(bufferlist &bl) const
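
With filestore_xattr_use_omap gone, spillover to omap is unconditional and driven by the per-fs limits chosen in set_xattr_limits_via_conf(). The rule _setattrs applies, restated as a standalone predicate (hypothetical name):

    #include <cstddef>
    #include <set>
    #include <string>

    // An xattr leaves the inline (filesystem) store for omap when its value
    // exceeds the inline size limit, or when it is new and the inline set is
    // already at the count limit.
    bool goes_to_omap(const std::string& name, std::size_t value_len,
                      const std::set<std::string>& inline_set,
                      std::size_t max_inline_size, std::size_t max_inline_count)
    {
      if (value_len > max_inline_size)
        return true;
      return !inline_set.count(name) && inline_set.size() >= max_inline_count;
    }
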
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index fdab0ece34f..c489fdd5796 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -64,6 +64,14 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1);
#endif
+enum fs_types {
+ FS_TYPE_NONE = 0,
+ FS_TYPE_XFS,
+ FS_TYPE_BTRFS,
+ FS_TYPE_ZFS,
+ FS_TYPE_OTHER
+};
+
class FileStoreBackend;
#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
@@ -593,6 +601,13 @@ private:
atomic_t m_filestore_kill_at;
bool m_filestore_sloppy_crc;
int m_filestore_sloppy_crc_block_size;
+ enum fs_types m_fs_type;
+
+  // Determine xattr limits based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+
FSSuperblock superblock;
/**
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 37661a01ea5..8f7d3ccb684 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1,4 +1,3 @@
-
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
@@ -1997,8 +1996,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
hobject_t cur;
vector<hobject_t> objects;
while (1) {
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2046,8 +2044,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
while (1) {
dout(1) << "Updating snap_mapper from main collection, "
<< done << " objects done" << dendl;
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2070,19 +2067,16 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
++j) {
if (j->snap < CEPH_MAXSNAP) {
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- bufferptr bp;
- r = store->getattr(
- cid,
+ bufferlist bl;
+ r = get_pgbackend()->objects_get_attr(
*j,
OI_ATTR,
- bp);
+ &bl);
if (r < 0) {
derr << __func__ << ": getattr returned "
<< cpp_strerror(r) << dendl;
assert(0);
}
- bufferlist bl;
- bl.push_back(bp);
object_info_t oi(bl);
set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
set<snapid_t> cur_snaps;
@@ -2412,9 +2406,8 @@ void PG::log_weirdness()
<< " log bound mismatch, empty but (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]\n";
} else {
- if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()) || // sloppy check
- (pg_log.get_log().log.rbegin()->version != pg_log.get_head() &&
- !(pg_log.get_head() == pg_log.get_tail())))
+ // sloppy check
+ if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
osd->clog.error() << info.pgid
<< " log bound mismatch, info (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]"
@@ -3039,9 +3032,9 @@ int PG::build_scrub_map_chunk(
// objects
vector<hobject_t> ls;
- int ret = osd->store->collection_list_range(coll, start, end, 0, &ls);
+ int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
if (ret < 0) {
- dout(5) << "collection_list_range error: " << ret << dendl;
+ dout(5) << "objects_list_range error: " << ret << dendl;
return ret;
}
@@ -3561,11 +3554,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
hobject_t start = scrubber.start;
while (!boundary_found) {
vector<hobject_t> objects;
- ret = osd->store->collection_list_partial(coll, start,
- cct->_conf->osd_scrub_chunk_min,
- cct->_conf->osd_scrub_chunk_max,
- 0,
- &objects, &scrubber.end);
+ ret = get_pgbackend()->objects_list_partial(
+ start,
+ cct->_conf->osd_scrub_chunk_min,
+ cct->_conf->osd_scrub_chunk_max,
+ 0,
+ &objects,
+ &scrubber.end);
assert(ret >= 0);
// in case we don't find a boundary: start again at the end
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 74809eea268..275d30c7658 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -48,6 +48,7 @@
#include "common/WorkQueue.h"
#include "common/ceph_context.h"
#include "include/str_list.h"
+#include "PGBackend.h"
#include <list>
#include <memory>
@@ -193,6 +194,8 @@ protected:
CephContext *cct;
OSDriver osdriver;
SnapMapper snap_mapper;
+
+ virtual PGBackend *get_pgbackend() = 0;
public:
void update_snap_mapper_bits(uint32_t bits) {
snap_mapper.update_bits(bits);
@@ -439,6 +442,7 @@ protected:
*/
struct BackfillInterval {
// info about a backfill interval on a peer
+ eversion_t version; /// version at which the scan occurred
map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
@@ -447,6 +451,7 @@ protected:
void clear() {
objects.clear();
begin = end = hobject_t();
+ version = eversion_t();
}
void reset(hobject_t start) {
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index e3cc05bf345..408c589a08a 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -205,6 +205,26 @@
virtual void clear_temp_obj(const hobject_t &oid) = 0;
virtual ~PGBackend() {}
+
+ /// List objects in collection
+ virtual int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next) = 0;
+
+ virtual int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls) = 0;
+
+ virtual int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out) = 0;
};
#endif
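
The three pure virtuals added above are the listing/attr surface PG code now uses instead of reaching into ObjectStore directly; ReplicatedBackend (below) is the first implementer. A toy backend satisfying the same contract, with strings standing in for hobject_t and bufferlist:

    #include <cerrno>
    #include <map>
    #include <string>

    struct ToyBackend {
      std::map<std::string, std::map<std::string, std::string>> store;

      // Same shape as objects_get_attr(): 0 on success, negative errno.
      int objects_get_attr(const std::string& oid, const std::string& attr,
                           std::string* out) const
      {
        auto o = store.find(oid);
        if (o == store.end())
          return -ENOENT;
        auto a = o->second.find(attr);
        if (a == o->second.end())
          return -ENODATA;
        if (out)
          *out = a->second;
        return 0;
      }
    };
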
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 6e025f289bc..1949c96fd57 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -52,13 +52,9 @@ void PGLog::IndexedLog::split_into(
if (log.empty())
tail = head;
- else
- head = log.rbegin()->version;
if (olog->empty())
olog->tail = olog->head;
- else
- olog->head = olog->log.rbegin()->version;
olog->index();
index();
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index b39207e14f8..9529e15ae77 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -194,3 +194,75 @@ void ReplicatedBackend::on_flushed()
assert(0 == "found garbage in the temp collection");
}
}
+
+
+int ReplicatedBackend::objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next)
+{
+ vector<ghobject_t> objects;
+ ghobject_t _next;
+ int r = osd->store->collection_list_partial(
+ coll,
+ begin,
+ min,
+ max,
+ seq,
+ &objects,
+ &_next);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate());
+ ls->push_back(i->hobj);
+ }
+ assert(_next.is_degenerate());
+ *next = _next.hobj;
+ return r;
+}
+
+int ReplicatedBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls)
+{
+ vector<ghobject_t> objects;
+ int r = osd->store->collection_list_range(
+ coll,
+ start,
+ end,
+ seq,
+ &objects);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate());
+ ls->push_back(i->hobj);
+ }
+ return r;
+}
+
+int ReplicatedBackend::objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out)
+{
+ bufferptr bp;
+ int r = osd->store->getattr(
+ coll,
+ hoid,
+ attr.c_str(),
+ bp);
+ if (r >= 0 && out) {
+ out->clear();
+ out->push_back(bp);
+ }
+ return r;
+}
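
objects_list_partial() above is a cursor API: each call returns up to max entries starting at begin, plus a next cursor to resume from. A self-contained model of driving it to completion (strings in place of hobject_t; illustrative only):

    #include <set>
    #include <string>
    #include <vector>

    static const std::set<std::string> g_objects = {"a", "b", "c", "d", "e"};

    // Toy stand-in: up to 'max' objects at or after 'begin', plus a resume
    // cursor, like collection_list_partial() underneath the backend call.
    int list_partial(const std::string& begin, int max,
                     std::vector<std::string>* ls, std::string* next)
    {
      auto it = g_objects.lower_bound(begin);
      for (int n = 0; it != g_objects.end() && n < max; ++it, ++n)
        ls->push_back(*it);
      *next = (it == g_objects.end()) ? std::string("\x7f") : *it;
      return 0;
    }

    std::vector<std::string> list_all()
    {
      std::vector<std::string> out;
      std::string cur;  // empty string sorts first, like a minimal hobject
      for (;;) {
        std::vector<std::string> batch;
        std::string next;
        if (list_partial(cur, 2, &batch, &next) < 0 || batch.empty())
          break;
        out.insert(out.end(), batch.begin(), batch.end());
        cur = next;  // resume where the previous batch ended
      }
      return out;
    }

The is_degenerate() asserts in the real implementation guard the ghobject_t -> hobject_t strip: the backend only hands back objects that carry no generation or shard qualifier.
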
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index e34e55a618e..cc5f060e136 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -148,6 +148,26 @@ public:
f->close_section();
}
}
+
+ /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out);
private:
// push
struct PushInfo {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 1e2a863e389..93540f4fc81 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -398,8 +398,10 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
bufferlist bl;
-
- int ret = osd->store->getattr(coll_t(info.pgid), sobj, filter->get_xattr().c_str(), bl);
+ int ret = pgbackend->objects_get_attr(
+ sobj,
+ filter->get_xattr(),
+ &bl);
dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
if (ret < 0)
return false;
@@ -639,12 +641,13 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
hobject_t next;
hobject_t current = response.handle;
osr->flush();
- int r = osd->store->collection_list_partial(coll, current,
- list_size,
- list_size,
- snapid,
- &sentries,
- &next);
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ snapid,
+ &sentries,
+ &next);
if (r != 0) {
result = -EINVAL;
break;
@@ -682,13 +685,17 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
if (snapid != CEPH_NOSNAP) {
bufferlist bl;
if (candidate.snap == CEPH_NOSNAP) {
- osd->store->getattr(coll, candidate, SS_ATTR, bl);
+ pgbackend->objects_get_attr(
+ candidate,
+ SS_ATTR,
+ &bl);
SnapSet snapset(bl);
if (snapid <= snapset.seq)
continue;
} else {
bufferlist attr_bl;
- osd->store->getattr(coll, candidate, OI_ATTR, attr_bl);
+ pgbackend->objects_get_attr(
+ candidate, OI_ATTR, &attr_bl);
object_info_t oi(attr_bl);
vector<snapid_t>::iterator i = find(oi.snaps.begin(),
oi.snaps.end(),
@@ -1536,8 +1543,9 @@ void ReplicatedPG::do_scan(
BackfillInterval bi;
osr->flush();
+ bi.begin = m->begin;
scan_range(
- m->begin, cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max, &bi, handle);
MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap()->get_epoch(), m->query_epoch,
@@ -2659,7 +2667,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
string aname;
bp.copy(op.xattr.name_len, aname);
string name = "_" + aname;
- int r = osd->store->getattr(coll, soid, name.c_str(), osd_op.outdata);
+ int r = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &(osd_op.outdata));
if (r >= 0) {
op.xattr.value_len = r;
result = 0;
@@ -2702,9 +2713,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
bufferlist xattr;
if (op.op == CEPH_OSD_OP_CMPXATTR)
- result = osd->store->getattr(coll, soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &xattr);
else
- result = osd->store->getattr(coll, src_obc->obs.oi.soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ src_obc->obs.oi.soid,
+ name,
+ &xattr);
if (result < 0 && result != -EEXIST && result != -ENODATA)
break;
@@ -3675,7 +3692,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
goto fail;
}
- if (!ctx->copy_op) {
+ if (!ctx->copy_cb) {
// start
pg_t raw_pg;
get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
@@ -3687,13 +3704,18 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
break;
}
- result = start_copy(ctx, src, src_oloc, src_version);
+ hobject_t temp_target = generate_temp_object();
+ CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target);
+ ctx->copy_cb = cb;
+ result = start_copy(cb, ctx->obc, src, src_oloc, src_version,
+ temp_target);
if (result < 0)
goto fail;
result = -EINPROGRESS;
} else {
// finish
- result = finish_copy(ctx);
+ assert(ctx->copy_cb->get_result() >= 0);
+ result = finish_copyfrom(ctx);
}
}
break;
@@ -3785,37 +3807,35 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
int ret = find_object_context(
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
- if (ret) {
- if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) {
- // there's no snapshot here, or there's no object.
- // if there's no snapshot, we delete the object; otherwise, do nothing.
- dout(20) << "_rollback_to deleting head on " << soid.oid
- << " because got ENOENT|whiteout on find_object_context" << dendl;
- if (ctx->obc->obs.oi.watchers.size()) {
- // Cannot delete an object with watchers
- ret = -EBUSY;
- } else {
- _delete_head(ctx);
- ret = 0;
- }
- } else if (-EAGAIN == ret) {
- /* a different problem, like degraded pool
- * with not-yet-restored object. We shouldn't have been able
- * to get here; recovery should have completed first! */
- hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
- info.pgid.pool(), soid.get_namespace());
- assert(is_missing_object(rollback_target));
- dout(20) << "_rollback_to attempted to roll back to a missing object "
- << rollback_target << " (requested snapid: ) " << snapid << dendl;
- wait_for_missing_object(rollback_target, ctx->op);
+ if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
+ // there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object; otherwise, do nothing.
+ dout(20) << "_rollback_to deleting head on " << soid.oid
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
+ if (ctx->obc->obs.oi.watchers.size()) {
+ // Cannot delete an object with watchers
+ ret = -EBUSY;
} else {
- // ummm....huh? It *can't* return anything else at time of writing.
- assert(0);
- }
+ _delete_head(ctx);
+ ret = 0;
+ }
+ } else if (-EAGAIN == ret) {
+ /* a different problem, like degraded pool
+ * with not-yet-restored object. We shouldn't have been able
+ * to get here; recovery should have completed first! */
+ hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
+ info.pgid.pool(), soid.get_namespace());
+ assert(is_missing_object(rollback_target));
+ dout(20) << "_rollback_to attempted to roll back to a missing object "
+	     << rollback_target << " (requested snapid: " << snapid << ")" << dendl;
+ wait_for_missing_object(rollback_target, ctx->op);
+ } else if (ret) {
+ // ummm....huh? It *can't* return anything else at time of writing.
+ assert(0 == "unexpected error code in _rollback_to");
} else { //we got our context, let's use it to do the rollback!
hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
if (is_degraded_object(rollback_to_sobject)) {
- dout(20) << "_rollback_to attempted to roll back to a degraded object "
+ dout(20) << "_rollback_to attempted to roll back to a degraded object "
 	     << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
wait_for_degraded_object(rollback_to_sobject, ctx->op);
ret = -EAGAIN;
@@ -4292,11 +4312,12 @@ struct C_Copyfrom : public Context {
}
};
-int ReplicatedPG::start_copy(OpContext *ctx,
- hobject_t src, object_locator_t oloc, version_t version)
+int ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
+ hobject_t src, object_locator_t oloc, version_t version,
+ const hobject_t& temp_dest_oid)
{
- const hobject_t& dest = ctx->obs->oi.soid;
- dout(10) << __func__ << " " << dest << " ctx " << ctx
+ const hobject_t& dest = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << dest
<< " from " << src << " " << oloc << " v" << version
<< dendl;
@@ -4308,19 +4329,18 @@ int ReplicatedPG::start_copy(OpContext *ctx,
cancel_copy(cop);
}
- CopyOpRef cop(new CopyOp(ctx, src, oloc, version));
+ CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid));
copy_ops[dest] = cop;
- ctx->copy_op = cop;
- ++ctx->obc->copyfrom_readside;
+ ++obc->copyfrom_readside;
- _copy_some(ctx, cop);
+ _copy_some(obc, cop);
return 0;
}
-void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop)
+void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
{
- dout(10) << __func__ << " " << ctx << " " << cop << dendl;
+ dout(10) << __func__ << " " << obc << " " << cop << dendl;
ObjectOperation op;
if (cop->version) {
op.assert_version(cop->version);
@@ -4334,7 +4354,7 @@ void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop)
&cop->data, &cop->omap,
&cop->rval);
- C_Copyfrom *fin = new C_Copyfrom(this, ctx->obs->oi.soid,
+ C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
get_last_peering_reset());
osd->objecter_lock.Lock();
tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
@@ -4362,50 +4382,49 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
<< " tid " << cop->objecter_tid << dendl;
return;
}
- OpContext *ctx = cop->ctx;
+ ObjectContextRef obc = cop->obc;
cop->objecter_tid = 0;
- if (r < 0) {
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- kick_object_context_blocked(ctx->obc);
- reply_ctx(ctx, r);
- return;
- }
- assert(cop->rval >= 0);
- if (!cop->cursor.is_complete()) {
- // write out what we have so far
- vector<OSDOp> ops;
- tid_t rep_tid = osd->get_tid();
- osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
- OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &ctx->obc->obs, ctx->obc->ssc, this);
- tctx->mtime = ceph_clock_now(g_ceph_context);
- RepGather *repop = new_repop(tctx, ctx->obc, rep_tid);
-
- if (cop->temp_cursor.is_initial()) {
- cop->temp_coll = get_temp_coll(&tctx->local_t);
- cop->temp_oid = generate_temp_object();
- repop->ctx->new_temp_oid = cop->temp_oid;
- }
+ CopyResults results;
+ if (r >= 0) {
+ assert(cop->rval >= 0);
+
+ if (!cop->cursor.is_complete()) {
+ // write out what we have so far
+ vector<OSDOp> ops;
+ tid_t rep_tid = osd->get_tid();
+ osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
+ OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this);
+ tctx->mtime = ceph_clock_now(g_ceph_context);
+ RepGather *repop = new_repop(tctx, obc, rep_tid);
+
+ if (cop->temp_cursor.is_initial()) {
+ cop->temp_coll = get_temp_coll(&tctx->local_t);
+ repop->ctx->new_temp_oid = cop->temp_oid;
+ }
- _write_copy_chunk(cop, &tctx->op_t);
+ _write_copy_chunk(cop, &tctx->op_t);
- issue_repop(repop, repop->ctx->mtime);
- eval_repop(repop);
- repop->put();
+ issue_repop(repop, repop->ctx->mtime);
+ eval_repop(repop);
+ repop->put();
- dout(10) << __func__ << " fetching more" << dendl;
- _copy_some(ctx, cop);
- return;
+ dout(10) << __func__ << " fetching more" << dendl;
+ _copy_some(obc, cop);
+ return;
+ } else {
+ _build_finish_copy_transaction(cop, results.get<3>());
+ results.get<1>() = cop->temp_cursor.data_offset;
+ }
}
dout(20) << __func__ << " complete; committing" << dendl;
- execute_ctx(ctx);
+ results.get<0>() = cop->rval;
+ cop->cb->complete(results);
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- ctx->copy_op.reset();
- kick_object_context_blocked(ctx->obc);
+ copy_ops.erase(obc->obs.oi.soid);
+ --obc->copyfrom_readside;
+ kick_object_context_blocked(obc);
}
void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
@@ -4432,16 +4451,12 @@ void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
cop->temp_cursor = cop->cursor;
}
-int ReplicatedPG::finish_copy(OpContext *ctx)
+void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t)
{
- CopyOpRef cop = ctx->copy_op;
- ObjectState& obs = ctx->new_obs;
- ObjectStore::Transaction& t = ctx->op_t;
+ ObjectState& obs = cop->obc->obs;
- if (!obs.exists) {
- ctx->delta_stats.num_objects++;
- obs.exists = true;
- } else {
+ if (obs.exists) {
t.remove(coll, obs.oi.soid);
}
@@ -4455,18 +4470,34 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
_write_copy_chunk(cop, &t);
t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, obs.oi.soid);
pgbackend->clear_temp_obj(cop->temp_oid);
- ctx->discard_temp_oid = cop->temp_oid;
}
+}
+
+int ReplicatedPG::finish_copyfrom(OpContext *ctx)
+{
+ dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
+ ObjectState& obs = ctx->new_obs;
+ CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
+
+ if (!ctx->obs->exists) {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ }
+ if (cb->is_temp_obj_used()) {
+ ctx->discard_temp_oid = cb->temp_obj;
+ }
+ ctx->op_t.swap(cb->results.get<3>());
+ ctx->op_t.append(cb->results.get<3>());
interval_set<uint64_t> ch;
if (obs.oi.size > 0)
ch.insert(0, obs.oi.size);
ctx->modified_ranges.union_of(ch);
- if (cop->cursor.data_offset != obs.oi.size) {
+ if (cb->get_data_size() != obs.oi.size) {
ctx->delta_stats.num_bytes -= obs.oi.size;
+ obs.oi.size = cb->get_data_size();
ctx->delta_stats.num_bytes += obs.oi.size;
- obs.oi.size = cop->cursor.data_offset;
}
ctx->delta_stats.num_wr++;
ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
@@ -4476,8 +4507,7 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
void ReplicatedPG::cancel_copy(CopyOpRef cop)
{
- OpContext *ctx = cop->ctx;
- dout(10) << __func__ << " " << ctx->obc->obs.oi.soid << " ctx " << ctx
+ dout(10) << __func__ << " " << cop->obc->obs.oi.soid
<< " from " << cop->src << " " << cop->oloc << " v" << cop->version
<< dendl;
@@ -4487,13 +4517,13 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop)
osd->objecter->op_cancel(cop->objecter_tid);
}
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- ctx->copy_op.reset();
+ copy_ops.erase(cop->obc->obs.oi.soid);
+ --cop->obc->copyfrom_readside;
- kick_object_context_blocked(ctx->obc);
-
- delete ctx;
+ kick_object_context_blocked(cop->obc);
+ bool temp_obj_created = !cop->cursor.is_initial();
+ CopyResults result(-ECANCELED, 0, temp_obj_created, ObjectStore::Transaction());
+ cop->cb->complete(result);
}
void ReplicatedPG::cancel_copy_ops()
@@ -4552,10 +4582,19 @@ void ReplicatedPG::apply_repop(RepGather *repop)
if (repop->ctx->clone_obc)
repop->ctx->clone_obc->ondisk_write_lock();
+ bool unlock_snapset_obc = false;
+ if (repop->ctx->snapset_obc && repop->ctx->snapset_obc->obs.oi.soid !=
+ repop->obc->obs.oi.soid) {
+ repop->ctx->snapset_obc->ondisk_write_lock();
+ unlock_snapset_obc = true;
+ }
+
Context *oncommit = new C_OSD_OpCommit(this, repop);
Context *onapplied = new C_OSD_OpApplied(this, repop);
- Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(repop->obc,
- repop->ctx->clone_obc);
+ Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
+ repop->obc,
+ repop->ctx->clone_obc,
+ unlock_snapset_obc ? repop->ctx->snapset_obc : ObjectContextRef());
int r = osd->store->queue_transactions(osr.get(), repop->tls, onapplied, oncommit, onapplied_sync, repop->ctx->op);
if (r) {
derr << "apply_repop queue_transactions returned " << r << " on " << *repop << dendl;
@@ -5145,7 +5184,7 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
assert(attrs->count(OI_ATTR));
bv.push_back(attrs->find(OI_ATTR)->second);
} else {
- int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
+ int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
if (r < 0) {
if (!can_create)
return ObjectContextRef(); // -ENOENT!
@@ -5409,12 +5448,12 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
if (!attrs) {
hobject_t head(oid, key, CEPH_NOSNAP, seed,
info.pgid.pool(), nspace);
- int r = osd->store->getattr(coll, head, SS_ATTR, bv);
+ int r = pgbackend->objects_get_attr(head, SS_ATTR, &bv);
if (r < 0) {
// try _snapset
hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
info.pgid.pool(), nspace);
- r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
+ r = pgbackend->objects_get_attr(snapdir, SS_ATTR, &bv);
if (r < 0 && !can_create)
return NULL;
}
@@ -7791,6 +7830,8 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
int peer = acting[i];
map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
assert(pm != peer_missing.end());
+ map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
+ assert(pi != peer_info.end());
size_t m_sz = pm->second.num_missing();
dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
@@ -7804,6 +7845,15 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
handle.reset_tp_timeout();
const hobject_t soid(p->second);
+ if (soid > pi->second.last_backfill) {
+ if (!recovering.count(soid)) {
+ derr << __func__ << ": object added to missing set for backfill, but "
+ << "is not in recovering, error!" << dendl;
+ assert(0);
+ }
+ continue;
+ }
+
if (recovering.count(soid)) {
dout(10) << __func__ << ": already recovering" << soid << dendl;
continue;
@@ -7871,17 +7921,12 @@ int ReplicatedPG::recover_backfill(
<< " interval " << pbi.begin << "-" << pbi.end
<< " " << pbi.objects.size() << " objects" << dendl;
- int local_min = osd->store->get_ideal_list_min();
- int local_max = osd->store->get_ideal_list_max();
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
- // re-scan our local interval to cope with recent changes
- // FIXME: we could track the eversion_t when we last scanned, and invalidate
- // that way. or explicitly modify/invalidate when we actually change specific
- // objects.
- dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl;
- backfill_info.clear();
- osr->flush();
- scan_range(backfill_pos, local_min, local_max, &backfill_info, handle);
+ // update our local interval to cope with recent changes
+ backfill_info.begin = backfill_pos;
+ update_range(&backfill_info, handle);
int ops = 0;
map<hobject_t, pair<eversion_t, eversion_t> > to_push;
@@ -7895,7 +7940,8 @@ int ReplicatedPG::recover_backfill(
if (backfill_info.begin <= pbi.begin &&
!backfill_info.extends_to_end() && backfill_info.empty()) {
osr->flush();
- scan_range(backfill_info.end, local_min, local_max, &backfill_info,
+ backfill_info.begin = backfill_info.end;
+ scan_range(local_min, local_max, &backfill_info,
handle);
backfill_info.trim();
}
@@ -8056,26 +8102,81 @@ void ReplicatedPG::prep_backfill_object_push(
start_recovery_op(oid);
recovering.insert(oid);
ObjectContextRef obc = get_object_context(oid, false);
+
+ // We need to take the read_lock here in order to flush in-progress writes
+ obc->ondisk_read_lock();
pgbackend->recover_object(
oid,
ObjectContextRef(),
obc,
h);
+ obc->ondisk_read_unlock();
+}
+
+void ReplicatedPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+ if (bi->version >= info.last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ assert(bi->version == info.last_update);
+ } else if (bi->version >= info.log_tail) {
+ assert(!pg_log.get_log().empty());
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log" << dendl;
+ list<pg_log_entry_t>::const_iterator i =
+ pg_log.get_log().log.end();
+ --i;
+ while (i != pg_log.get_log().log.begin() &&
+ i->version > bi->version) {
+ --i;
+ }
+ if (i->version == bi->version)
+ ++i;
+
+ assert(i != pg_log.get_log().log.end());
+ dout(10) << __func__ << ": updating from version " << i->version
+ << dendl;
+ for (; i != pg_log.get_log().log.end(); ++i) {
+ const hobject_t &soid = i->soid;
+ if (soid >= bi->begin && soid < bi->end) {
+ if (i->is_update()) {
+ dout(10) << __func__ << ": " << i->soid << " updated to version "
+ << i->version << dendl;
+ bi->objects.erase(i->soid);
+ bi->objects.insert(
+ make_pair(
+ i->soid,
+ i->version));
+ } else if (i->is_delete()) {
+ dout(10) << __func__ << ": " << i->soid << " removed" << dendl;
+ bi->objects.erase(i->soid);
+ }
+ }
+ }
+ bi->version = info.last_update;
+ } else {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ osr->flush();
+    scan_range(local_min, local_max, bi, handle);
+ }
}
void ReplicatedPG::scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
assert(is_locked());
- dout(10) << "scan_range from " << begin << dendl;
- bi->begin = begin;
+ dout(10) << "scan_range from " << bi->begin << dendl;
+ bi->version = info.last_update;
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);
- int r = osd->store->collection_list_partial(coll, begin, min, max,
- 0, &ls, &bi->end);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end);
assert(r >= 0);
dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
dout(20) << ls << dendl;
@@ -8090,7 +8191,7 @@ void ReplicatedPG::scan_range(
dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
} else {
bufferlist bl;
- int r = osd->store->getattr(coll, *p, OI_ATTR, bl);
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
assert(r >= 0);
object_info_t oi(bl);
bi->objects[*p] = oi.version;
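
The core of the new update_range() above is a log replay: entries newer than
the interval's version are applied over the cached object map instead of
rescanning the collection. A minimal standalone sketch of that step, using
plain std:: types in place of hobject_t/eversion_t/pg_log_entry_t (the Entry
struct and replay() name are illustrative, not Ceph API):

    #include <map>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for pg_log_entry_t fields.
    struct Entry { std::string soid; int version; bool is_delete; };

    // Replay log entries newer than bi_version onto a cached interval
    // [begin, end): updates overwrite the recorded version, deletes erase
    // the object -- mirroring the is_update()/is_delete() branches above.
    void replay(std::map<std::string, int>& objects,
                const std::string& begin, const std::string& end,
                int bi_version, const std::vector<Entry>& log) {
      for (const Entry& e : log) {
        if (e.version <= bi_version)
          continue;                        // already reflected in the scan
        if (e.soid < begin || e.soid >= end)
          continue;                        // outside this interval
        if (e.is_delete)
          objects.erase(e.soid);
        else
          objects[e.soid] = e.version;     // insert or refresh
      }
    }
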
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index a3d42e87600..27c9d1bb605 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -18,6 +18,7 @@
#define CEPH_REPLICATEDPG_H
#include <boost/optional.hpp>
+#include <boost/tuple/tuple.hpp>
#include "include/assert.h"
#include "common/cmdparse.h"
@@ -93,9 +94,11 @@ public:
* state associated with a copy operation
*/
struct OpContext;
+ class CopyCallback;
struct CopyOp {
- OpContext *ctx;
+ CopyCallback *cb;
+ ObjectContextRef obc;
hobject_t src;
object_locator_t oloc;
version_t version;
@@ -114,16 +117,86 @@ public:
hobject_t temp_oid;
object_copy_cursor_t temp_cursor;
- CopyOp(OpContext *c, hobject_t s, object_locator_t l, version_t v)
- : ctx(c), src(s), oloc(l), version(v),
+ CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l,
+ version_t v, const hobject_t& dest)
+ : cb(cb_), obc(_obc), src(s), oloc(l), version(v),
objecter_tid(0),
size(0),
- rval(-1)
+ rval(-1),
+ temp_oid(dest)
{}
};
typedef boost::shared_ptr<CopyOp> CopyOpRef;
+ /**
+   * The CopyCallback class defines an interface for completions of the
+   * start_copy code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+   * The implementer is responsible for making sure that the CopyCallback
+   * can associate itself with the correct copy operation. The closing
+   * Transaction lets write operations be applied atomically with the
+   * completion of the copy (which doing them in separate transactions
+   * would not allow); if you are doing the copy for a read op, you will
+   * have to generate a separate op to finish the copy.
+ */
+ /// return code, total object size, data in temp object?, final Transaction
+ typedef boost::tuple<int, size_t, bool, ObjectStore::Transaction> CopyResults;
+ class CopyCallback : public GenContext<CopyResults&> {
+ protected:
+ CopyCallback() {}
+ /**
+     * results.get<0>() is the return code: 0 for success; -ECANCELED if
+     * the operation was cancelled by the local OSD; -errno for other issues.
+     * results.get<1>() is the total size of the object (for updating pg stats).
+     * results.get<2>() indicates whether we have already written data to
+     * the temp object (so it needs to get cleaned up, if the return code
+     * indicates a failure).
+     * results.get<3>() is a Transaction; if non-empty, you must apply it
+     * before any other access to the object in order to complete the copy.
+ */
+ virtual void finish(CopyResults& results_) = 0;
+
+ public:
+    virtual ~CopyCallback() {}
+ };
+
+ class CopyFromCallback: public CopyCallback {
+ public:
+ CopyResults results;
+ OpContext *ctx;
+ hobject_t temp_obj;
+ CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) :
+ ctx(ctx_), temp_obj(temp_obj_) {}
+ ~CopyFromCallback() {}
+
+ virtual void finish(CopyResults& results_) {
+ results = results_;
+ int r = results.get<0>();
+ if (r >= 0) {
+ ctx->pg->execute_ctx(ctx);
+ }
+ ctx->copy_cb = NULL;
+ if (r < 0) {
+ if (r != -ECANCELED) { // on cancel just toss it out; client resends
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ }
+ delete ctx;
+ }
+ }
+
+ bool is_temp_obj_used() { return results.get<2>(); }
+ uint64_t get_data_size() { return results.get<1>(); }
+ int get_result() { return results.get<0>(); }
+ };
+ friend class CopyFromCallback;
+
boost::scoped_ptr<PGBackend> pgbackend;
+ PGBackend *get_pgbackend() {
+ return pgbackend.get();
+ }
/// Listener methods
void on_local_recover_start(
@@ -297,7 +370,7 @@ public:
int num_read; ///< count read ops
int num_write; ///< count update ops
- CopyOpRef copy_op;
+ CopyFromCallback *copy_cb;
hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
@@ -314,7 +387,8 @@ public:
current_osd_subop_num(0),
data_off(0), reply(NULL), pg(_pg),
num_read(0),
- num_write(0) {
+ num_write(0),
+ copy_cb(NULL) {
if (_ssc) {
new_snapset = _ssc->snapset;
snapset = &_ssc->snapset;
@@ -619,10 +693,16 @@ protected:
* @bi [out] resulting map of objects to eversion_t's
*/
void scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
void prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
PGBackend::RecoveryHandle *h);
@@ -662,12 +742,17 @@ protected:
}
};
struct C_OSD_OndiskWriteUnlock : public Context {
- ObjectContextRef obc, obc2;
- C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {}
+ ObjectContextRef obc, obc2, obc3;
+ C_OSD_OndiskWriteUnlock(
+ ObjectContextRef o,
+ ObjectContextRef o2 = ObjectContextRef(),
+ ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
void finish(int r) {
obc->ondisk_write_unlock();
if (obc2)
obc2->ondisk_write_unlock();
+ if (obc3)
+ obc3->ondisk_write_unlock();
}
};
struct C_OSD_OndiskWriteUnlockList : public Context {
@@ -723,11 +808,15 @@ protected:
// -- copyfrom --
map<hobject_t, CopyOpRef> copy_ops;
- int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version);
+ int start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
+ object_locator_t oloc, version_t version,
+ const hobject_t& temp_dest_oid);
void process_copy_chunk(hobject_t oid, tid_t tid, int r);
void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t);
- void _copy_some(OpContext *ctx, CopyOpRef cop);
- int finish_copy(OpContext *ctx);
+ void _copy_some(ObjectContextRef obc, CopyOpRef cop);
+ void _build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t);
+ int finish_copyfrom(OpContext *ctx);
void cancel_copy(CopyOpRef cop);
void cancel_copy_ops();
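
With the CopyCallback/CopyFromCallback split above, any consumer of the copy
machinery can supply its own completion to start_copy(). A sketch of a
minimal subclass, assuming only the declarations in this header (the class
name and its behavior are illustrative, not part of the patch):

    // Illustrative only: a CopyCallback that records the outcome and
    // relies solely on the CopyResults layout documented above.
    class ExampleCopyCallback : public ReplicatedPG::CopyCallback {
      int result;
    public:
      ExampleCopyCallback() : result(0) {}
      void finish(ReplicatedPG::CopyResults& results) {
        result = results.get<0>();
        if (result == -ECANCELED)
          return;            // cancelled locally; the client will resend
        if (result < 0 && results.get<2>())
          return;            // temp object holds data; caller must clean it up
        // on success, results.get<3>() must be applied before any other
        // access to the target object, per the interface comment above
      }
    };
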
diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c
index eea6edb9eb8..2a6a8d22e81 100644
--- a/src/rbd_fuse/rbd-fuse.c
+++ b/src/rbd_fuse/rbd-fuse.c
@@ -1,7 +1,7 @@
/*
* rbd-fuse
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include "include/int_types.h"
diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
index f81598ccfb8..8fcf3f30e82 100644
--- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc
+++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
@@ -24,6 +24,7 @@
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/config.h"
+#include "common/strtol.h"
using namespace std;
@@ -38,7 +39,7 @@ class StoreTool
db.reset(db_ptr);
}
- void list(const string &prefix) {
+ void list(const string &prefix, const bool do_crc) {
KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
if (prefix.empty())
@@ -51,7 +52,11 @@ class StoreTool
if (!prefix.empty() && (rk.first != prefix))
break;
- std::cout << rk.first << ":" << rk.second << std::endl;
+ std::cout << rk.first << ":" << rk.second;
+ if (do_crc) {
+ std::cout << " (" << iter->value().crc32c(0) << ")";
+ }
+ std::cout << std::endl;
iter->next();
}
}
@@ -79,7 +84,7 @@ class StoreTool
assert(!prefix.empty() && !key.empty());
map<string,bufferlist> result;
- set<string> keys;
+ std::set<std::string> keys;
keys.insert(key);
db->get(prefix, keys, &result);
@@ -101,6 +106,18 @@ class StoreTool
std::cout << "total: " << s << std::endl;
return s;
}
+
+ bool set(const string &prefix, const string &key, bufferlist &val) {
+ assert(!prefix.empty());
+ assert(!key.empty());
+ assert(val.length() > 0);
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->set(prefix, key, val);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+ }
};
void usage(const char *pname)
@@ -109,10 +126,12 @@ void usage(const char *pname)
<< "\n"
<< "Commands:\n"
<< " list [prefix]\n"
+ << " list-crc [prefix]\n"
<< " exists <prefix> [key]\n"
<< " get <prefix> <key>\n"
- << " verify <store path>\n"
+ << " crc <prefix> <key>\n"
<< " get-size\n"
+ << " set <prefix> <key> [ver <N>|in <file>]\n"
<< std::endl;
}
@@ -140,12 +159,14 @@ int main(int argc, const char *argv[])
StoreTool st(path);
- if (cmd == "list") {
+ if (cmd == "list" || cmd == "list-crc") {
string prefix;
if (argc > 3)
prefix = argv[3];
- st.list(prefix);
+ bool do_crc = (cmd == "list-crc");
+
+ st.list(prefix, do_crc);
} else if (cmd == "exists") {
string key;
@@ -183,10 +204,63 @@ int main(int argc, const char *argv[])
bl.hexdump(os);
std::cout << os.str() << std::endl;
- } else if (cmd == "verify") {
- assert(0);
+ } else if (cmd == "crc") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << prefix << ", " << key << ") ";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << " crc " << bl.crc32c(0) << std::endl;
+
} else if (cmd == "get-size") {
std::cout << "estimated store size: " << st.get_size() << std::endl;
+
+ } else if (cmd == "set") {
+ if (argc < 7) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+ string subcmd(argv[5]);
+
+ bufferlist val;
+ string errstr;
+ if (subcmd == "ver") {
+ version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr);
+ if (!errstr.empty()) {
+ std::cerr << "error reading version: " << errstr << std::endl;
+ return 1;
+ }
+ ::encode(v, val);
+ } else if (subcmd == "in") {
+ int ret = val.read_file(argv[6], &errstr);
+ if (ret < 0 || !errstr.empty()) {
+ std::cerr << "error reading file: " << errstr << std::endl;
+ return 1;
+ }
+ } else {
+ std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool ret = st.set(prefix, key, val);
+ if (!ret) {
+ std::cerr << "error setting ("
+ << prefix << "," << key << ")" << std::endl;
+ return 1;
+ }
+
} else {
std::cerr << "Unrecognized command: " << cmd << std::endl;
return 1;
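
The new set subcommand stages the write in a single transaction and commits
it synchronously. A sketch of that pattern, assuming only the KeyValueDB
calls already used in the diff above (the free-function wrapper is
illustrative):

    // Sketch of the write path behind StoreTool::set() above: one
    // transaction, one key, committed synchronously before returning.
    bool write_key(KeyValueDB *db, const std::string &prefix,
                   const std::string &key, bufferlist &val) {
      KeyValueDB::Transaction tx = db->get_transaction();
      tx->set(prefix, key, val);                    // stage the write
      return db->submit_transaction_sync(tx) == 0;  // durable on success
    }
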
diff --git a/src/test/cli-integration/rbd/formatted-output.t b/src/test/cli-integration/rbd/formatted-output.t
index bece14f11f1..707e0749367 100644
--- a/src/test/cli-integration/rbd/formatted-output.t
+++ b/src/test/cli-integration/rbd/formatted-output.t
@@ -39,7 +39,7 @@ For now, use a more inclusive regex.
$ rbd info foo
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info foo --format json | python -mjson.tool
@@ -67,7 +67,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info foo@snap
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
\tprotected: False (esc)
@@ -96,7 +96,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -131,7 +131,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap
rbd image 'bar':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -169,7 +169,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap2
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -207,7 +207,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info baz
rbd image 'baz':
\tsize 2048 MB in 512 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -241,8 +241,8 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
</image>
$ rbd info quux
rbd image 'quux':
- \tsize 1024 KB in 1 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \tsize 1024 kB in 1 objects (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info quux --format json | python -mjson.tool
@@ -268,7 +268,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -303,7 +303,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child@snap
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -375,7 +375,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
NAME SIZE PARENT FMT PROT LOCK
foo 1024M 1
foo@snap 1024M 1
- quux 1024K 1 excl
+ quux 1024k 1 excl
bar 1024M 2
bar@snap 512M 2 yes
bar@snap2 1024M 2
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
index 8e3661b2cc1..cfd41305caa 100644
--- a/src/test/common/test_bloom_filter.cc
+++ b/src/test/common/test_bloom_filter.cc
@@ -23,7 +23,17 @@ TEST(BloomFilter, Basic) {
ASSERT_TRUE(bf.contains("bar"));
}
+TEST(BloomFilter, Empty) {
+ bloom_filter bf;
+ for (int i=0; i<100; ++i) {
+ ASSERT_FALSE(bf.contains(i));
+ ASSERT_FALSE(bf.contains(stringify(i)));
+ }
+}
+
TEST(BloomFilter, Sweep) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
for (int ex = 3; ex < 12; ex += 2) {
for (float fpp = .001; fpp < .5; fpp *= 4.0) {
@@ -62,7 +72,9 @@ TEST(BloomFilter, Sweep) {
}
TEST(BloomFilter, SweepInt) {
- std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl;
for (int ex = 3; ex < 12; ex += 2) {
for (float fpp = .001; fpp < .5; fpp *= 4.0) {
int max = 2 << ex;
@@ -92,15 +104,70 @@ TEST(BloomFilter, SweepInt) {
double byte_per_insert = (double)bl.length() / (double)max;
- std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert
+ << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl;
ASSERT_TRUE(actual < fpp * 10);
ASSERT_TRUE(actual > fpp / 10);
+ ASSERT_TRUE(bf.density() > 0.40);
+ ASSERT_TRUE(bf.density() < 0.60);
}
}
}
+TEST(BloomFilter, CompressibleSweep) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
+ std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n";
+ float fpp = .01;
+ int max = 1024;
+ for (int div = 1; div < 10; div++) {
+ compressible_bloom_filter bf(max, fpp, 1);
+ int t = max/div;
+ for (int n = 0; n < t; n++)
+ bf.insert(n);
+
+ unsigned est = bf.approx_unique_element_count();
+ if (div > 1)
+ bf.compress(1.0 / div);
+
+ for (int n = 0; n < t; n++)
+ ASSERT_TRUE(bf.contains(n));
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains(100000 + n))
+ hit++;
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+ unsigned est_after = bf.approx_unique_element_count();
+ std::cout << max
+ << "\t" << t
+ << "\t" << est
+ << "\t" << est_after
+ << "\t" << fpp
+ << "\t" << actual
+ << "\t" << bl.length() << "\t" << byte_per_insert
+ << std::endl;
+
+ ASSERT_TRUE(actual < fpp * 2.0);
+ ASSERT_TRUE(actual > fpp / 2.0);
+ ASSERT_TRUE(est_after < est * 2);
+ ASSERT_TRUE(est_after > est / 2);
+ }
+}
+
TEST(BloomFilter, BinSweep) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
int total_max = 16384;
float total_fpp = .01;
std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 3ee9e03ff7d..18ed795c3ef 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -6,6 +6,7 @@ TYPE(filepath)
#include "common/bloom_filter.hpp"
TYPE(bloom_filter)
+TYPE(compressible_bloom_filter)
#include "common/snap_types.h"
TYPE(SnapContext)
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index a87ecebb4c1..aba6a531c6f 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -143,6 +143,7 @@ public:
map<int, map<string,ObjectDesc> > pool_obj_cont;
set<string> oid_in_use;
set<string> oid_not_in_use;
+ set<int> snaps_in_use;
int current_snap;
string pool_name;
librados::IoCtx io_ctx;
@@ -1043,6 +1044,7 @@ public:
if (!(err == -ENOENT && old_value.deleted())) {
cerr << num << ": Error: oid " << oid << " read returned error code "
<< err << std::endl;
+ context->errors++;
}
} else {
cout << num << ": expect " << old_value.most_recent() << std::endl;
@@ -1314,6 +1316,8 @@ public:
}
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
+ context->snaps_in_use.insert(roll_back_to);
+
context->roll_back(oid, roll_back_to);
uint64_t snap = context->snaps[roll_back_to];
@@ -1341,6 +1345,7 @@ public:
context->update_object_version(oid, comp->get_version64());
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
+ context->snaps_in_use.erase(roll_back_to);
context->kick();
}
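
The snaps_in_use set added above lets the model refuse to start a rollback
against a snap that another in-flight rollback holds; the TEST_OP_ROLLBACK
change below retries until it draws a free snap. A self-contained sketch of
that selection loop (names illustrative; the real code uses rand_choose):

    #include <cstdint>
    #include <cstdlib>
    #include <iterator>
    #include <map>
    #include <set>

    // Rejection-sample a snap id not currently in use. Only safe when the
    // caller has verified snaps.size() > in_use.size(), as the size guard
    // in TEST_OP_ROLLBACK below does.
    int choose_free_snap(const std::map<int, std::uint64_t> &snaps,
                         const std::set<int> &in_use) {
      while (true) {
        auto it = snaps.begin();
        std::advance(it, std::rand() % snaps.size());
        if (!in_use.count(it->first))
          return it->first;
      }
    }
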
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index be919161579..7158f50a74a 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -120,13 +120,16 @@ private:
}
case TEST_OP_ROLLBACK:
- if (context.snaps.empty()) {
+ if (context.snaps.size() <= context.snaps_in_use.size()) {
return NULL;
- } else {
+ }
+ while (true) {
int snap = rand_choose(context.snaps)->first;
+ if (context.snaps_in_use.count(snap))
+ continue; // in use; try again!
string oid = *(rand_choose(context.oid_not_in_use));
cout << "rollback oid " << oid << " to " << snap << std::endl;
- return new RollbackOp(m_op, &context, oid, snap);
+ return new RollbackOp(m_op, &context, oid, snap);
}
case TEST_OP_SETATTR: