60 files changed, 1695 insertions, 234 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes index a9880942b5a..ec184746c58 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -19,3 +19,15 @@ v0.67 commandline tool. ceph_rest_api.py can be used as a WSGI application for deployment in a more-capable web server. See ceph-rest-api.8 for more. + +* The radosgw caps were inconsistently documented to be either 'mon = + allow r' or 'mon = allow rw'. The 'mon = allow rw' is required for + radosgw to create its own pools. All documentation has been updated + accordingly. + +* rgw copy object operation may return extra progress info during the + operation. At this point it will only happen when doing cross zone + copy operations. The S3 response will now return extra <Progress> + field under the <CopyResult> container. The Swift response will + now send the progress as a json array. + diff --git a/configure.ac b/configure.ac index 09fd81d722d..415da311712 100644 --- a/configure.ac +++ b/configure.ac @@ -250,7 +250,10 @@ AS_IF([test "x$with_fuse" != xno], [Define if you have fuse]) HAVE_LIBFUSE=1 # look for fuse_getgroups and define FUSE_GETGROUPS if found + LIBS_saved="$LIBS" + LIBS="$LIBS -lfuse" AC_CHECK_FUNCS([fuse_getgroups]) + LIBS="$LIBS_saved" ], [AC_MSG_FAILURE( [no FUSE found (use --without-fuse to disable)])])]) diff --git a/doc/changelog/v0.61.5.txt b/doc/changelog/v0.61.5.txt new file mode 100644 index 00000000000..9d99f910ca4 --- /dev/null +++ b/doc/changelog/v0.61.5.txt @@ -0,0 +1,1199 @@ +commit 8ee10dc4bb73bdd918873f29c70eedc3c7ef1979 +Author: Gary Lowell <gary.lowell@inktank.com> +Date: Wed Jul 17 16:39:08 2013 -0700 + + v0.61.5 + +commit 39bffac6b6c898882d03de392f7f2218933d942b +Author: Sage Weil <sage@inktank.com> +Date: Tue Jul 16 13:14:50 2013 -0700 + + ceph-disk: rely on /dev/disk/by-partuuid instead of special-casing journal symlinks + + This was necessary when ceph-disk-udev didn't create the by-partuuid (and + other) symlinks for us, but now it is fragile and error-prone. (It also + appears to be broken on a certain customer RHEL VM.) See + d7f7d613512fe39ec883e11d201793c75ee05db1. + + Instead, just use the by-partuuid symlinks that we spent all that ugly + effort generating. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit 64379e701b3ed862c05f156539506d3382f77aa8) + +commit 363d54288254b5e2311cd28fce5988d68cfd5773 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Tue Jul 16 16:49:48 2013 +0100 + + mon: Monitor: StoreConverter: clearer debug message on 'needs_conversion()' + + The previous debug message outputted the function's name, as often our + functions do. This was however a source of bewilderment, as users would + see those in logs and think their stores would need conversion. Changing + this message is trivial enough and it will make ceph users happier log + readers. + + Backport: cuttlefish + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ad1392f68170b391d11df0ce5523c2d1fb57f60e) + +commit 0ea89760def73f76d8100889eca3c25b0a6eb772 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Tue Jul 16 16:45:39 2013 +0100 + + mon: Monitor: do not reopen MonitorDBStore during conversion + + We already open the store on ceph_mon.cc, before we start the conversion. 
+ Given we are unable to reproduce this every time a conversion is triggered, + we are led to believe that this causes a race in leveldb that will lead + to 'store.db/LOCK' being locked upon the open this patch removes. + + Regardless, reopening the db here is pointless as we already did it when + we reach Monitor::StoreConverter::convert(). + + Fixes: #5640 + Backport: cuttlefish + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 036e6739a4e873863bae3d7d00f310c015dfcdb3) + +commit 595c09df9134fb0d62144fe1594914c90e567dca +Author: Sage Weil <sage@inktank.com> +Date: Sun Jul 14 15:54:29 2013 -0700 + + messages/MClientReconnect: clear data when encoding + + The MClientReconnect puts everything in the data payload portion of + the message and nothing in the front portion. That means that if the + message is resent (socket failure or something), the messenger thinks it + hasn't been encoded yet (front empty) and reencodes, which means + everything gets added (again) to the data portion. + + Decoding keep decoding until it runs out of data, so the second copy + means we decode garbage snap realms, leading to the crash in bug + + Clearing data each time around resolves the problem, although it does + mean we do the encoding work multiple times. We could alternatively + (or also) stick some data in the front portion of the payload + (ignored), but that changes the wire protocol and I would rather not + do that. + + Fixes: #4565 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 314cf046b0b787ca69665e8751eab6fe7adb4037) + +commit 8c178e0d39d8d4a4820eb061f79d74f95e60199f +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 15 10:05:37 2013 -0700 + + mon: once sync full is chosen, make sure we don't change our mind + + It is possible for a sequence like: + + - probe + - first probe reply has paxos trim that indicates a full sync is + needed + - start sync + - clear store + - something happens that makes us abort and bootstrap (e.g., the + provider mon restarts + - probe + - first probe reply has older paxos trim bound and we call an election + - on election completion, we crash because we have no data. + + Non-determinism of the probe decision aside, we need to ensure that + the info we share during probe (fc, lc) is accurate, and that once we + clear the store we know we *must* do a full sync. + + This is a backport of aa60f940ec1994a61624345586dc70d261688456. + + Fixes: #5621 + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + +commit 6af0ed9bc4cc955f8c30ad9dc6e9095599f323d0 +Author: Sage Weil <sage@inktank.com> +Date: Tue Jul 9 14:12:15 2013 -0700 + + mon: do not scrub if scrub is in progress + + This prevents an assert from unexpected scrub results from the previous + scrub on the leader. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 00ae543b3e32f89d906a0e934792cc5309f57696) + +commit 5642155ba5ca9b384a7af058a6538ac00c2a592d +Author: Sage Weil <sage@inktank.com> +Date: Wed Jul 10 10:06:20 2013 -0700 + + messages/MPGStats: do not set paxos version to osdmap epoch + + The PaxosServiceMessage version field is meant for client-coordinated + ordering of messages when switching between monitors (and is rarely + used). 
Do not fill it with the osdmap epoch lest it be compared to a + pgmap version, which may cause the mon to (near) indefinitely put it on + a wait queue until the pgmap version catches up. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit b36338be43f43b6dd4ee87c97f2eaa23b467c386) + +commit 06c65988bb0b1d1ec626fe31e9d806a1c4e24b28 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 11 18:43:24 2013 -0700 + + osd/OSDmap: fix OSDMap::Incremental::dump() for new pool names + + The name is always present when pools are created, but not when they are + modified. Also, a name may be present with a new_pools entry if the pool + is just renamed. Separate it out completely in the dump. + + Backport: cuttlefish, bobtail + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 3e4a29111e89588385e63f8d92ce3d67739dd679) + +commit 658240710baaf9c661b8fbf856322907a0d394ee +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 10:49:28 2013 -0700 + + mon/PaxosService: prevent reads until initial service commit is done + + Do not process reads (or, by PaxosService::dispatch() implication, writes) + until we have committed the initial service state. This avoids things like + EPERM due to missing keys when we race with mon creation, triggered by + teuthology tests doing their health check after startup. + + Fixes: #5515 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit d08b6d6df7dba06dad73bdec2c945f24afc02717) + +commit 5c3ff33771e227b3fb5cc354323846fe8db4ecc1 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 28 12:21:58 2013 -0700 + + client: send all request put's through put_request() + + Make sure all MetaRequest reference put's go through the same path that + releases inode references, including all of the error paths. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 87217e1e3cb2785b79d0dec49bd3f23a827551f5) + +commit 1df78ad73df581bc7537688ae28bda820b089a13 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 28 11:50:11 2013 -0700 + + client: fix remaining Inode::put() caller, and make method psuedo-private + + Not sure I can make this actually private and make Client::put_inode() a + friend method (making all of Client a friend would defeat the purpose). + This works well enough, though! + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 9af3b86b25574e4d2cdfd43e61028cffa19bdeb1) + +commit fea024cc3dd2c6fd9ff322d1cd15e0d75c92eca5 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jun 27 21:39:35 2013 -0700 + + client: use put_inode on MetaRequest inode refs + + When we drop the request inode refs, we need to use put_inode() to ensure + they get cleaned up properly (removed from inode_map, caps released, etc.). + Do this explicitly here (as we do with all other inode put() paths that + matter). + + Fixes: #5381 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 81bee6487fb1ce9e090b030d61bda128a3cf4982) + +commit 62ae39ec8f208cb8f89e43ba844b9a20b4315c61 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 15:57:48 2013 -0700 + + mon: be smarter about calculating last_epoch_clean lower bound + + We need to take PGs whose mapping has not changed in a long time into + account. 
For them, the pg state will indicate it was clean at the time of + the report, in which case we can use that as a lower-bound on their actual + latest epoch clean. If they are not currently clean (at report time), use + the last_epoch_clean value. + + Fixes: #5519 + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit cc0006deee3153e06ddd220bf8a40358ba830135) + +commit da725852190245d2f91b7b21e72baee70e4342bd +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 13:27:58 2013 -0700 + + osd: report pg stats to mon at least every N (=500) epochs + + The mon needs a moderately accurate last_epoch_clean value in order to trim + old osdmaps. To prevent a PG that hasn't peered or received IO in forever + from preventing this, send pg stats at some minimum frequency. This will + increase the pg stat report workload for the mon over an idle pool, but + should be no worse that a cluster that is getting actual IO and sees these + updates from normal stat updates. + + This makes the reported update a bit more aggressive/useful in that the epoch + is the last map epoch processed by this PG and not just one that is >= the + currenting interval. Note that the semantics of this field are pretty useless + at this point. + + See #5519 + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit da81228cc73c95737f26c630e5c3eccf6ae1aaec) + +commit 757af91b2af0da6bbfeeb53551fa1ef4ef9118ea +Author: Sage Weil <sage@inktank.com> +Date: Wed Jul 10 11:32:34 2013 -0700 + + osd: fix warning + + From 653e04a79430317e275dd77a46c2b17c788b860b + + Backport: cuttlefish, bobtail + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit bc291d3fc3fc1cac838565cbe0f25f71d855a6e3) + +commit 65af2538329472d2fd078bb961863c40cdabda12 +Merge: e537699 804314b +Author: Sage Weil <sage@inktank.com> +Date: Fri Jul 12 15:21:20 2013 -0700 + + Merge remote-tracking branch 'gh/wip-mon-sync-2' into cuttlefish + + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + +commit e537699b33f84c14f027b56372fbcb0a99bbe88d +Author: Sandon Van Ness <sandon@inktank.com> +Date: Wed Jul 10 14:55:52 2013 -0700 + + Get device-by-path by looking for it instead of assuming 3rd entry. + + On some systems (virtual machines so far) the device-by-path entry + from udevadm is not always in the same spot so instead actually + look for the right output instead of blindy assuming that its a + specific field in the output. + + Signed-off-by: Sandon Van Ness <sandon@inktank.com> + Reviewed-by: Gary Lowell <gary.lowell@inktank.com> + +commit 804314b8bfa5ec75cc9653e2928874c457395c92 +Merge: 6ad9fe1 78f2266 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jul 10 11:40:37 2013 -0700 + + Merge remote-tracking branch 'gh/cuttlefish' into wip-mon-sync-2 + +commit 78f226634bd80f6678b1f74ccf785bc52fcd6b62 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jul 10 11:02:08 2013 -0700 + + osd: limit number of inc osdmaps send to peers, clients + + We should not send an unbounded number of inc maps to our peers or clients. + In particular, if a peer is not contacted for a while, we may think they + have a very old map (say, 10000 epochs ago) and send thousands of inc maps + when the distribution shifts and we need to peer. + + Note that if we do not send enough maps, the peers will make do by + requesting the map from somewhere else (currently the mon). Regardless + of the source, however, we must limit the amount that we speculatively + share as it usually is not needed. 
+ + Backport: cuttlefish, bobtail + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Samuel Just <sam.just@inktank.com> + (cherry picked from commit 653e04a79430317e275dd77a46c2b17c788b860b) + +commit 54ee2dc80ed032c286546da51442340ec9991cdf +Author: Christophe Courtaut <christophe.courtaut@gmail.com> +Date: Mon Jul 1 14:57:17 2013 +0200 + + rgw: Fix return value for swift user not found + + http://tracker.ceph.com/issues/1779 fixes #1779 + + Adjust the return value from rgw_get_user_info_by_swift call + in RGW_SWIFT_Auth_Get::execute() to have the correct + return code in response. + (cherry picked from commit 4089001de1f22d6acd0b9f09996b71c716235551) + +commit 47852c263831707fff1570317a7446b0700c5962 +Author: Sage Weil <sage@inktank.com> +Date: Tue Jul 9 21:55:51 2013 -0700 + + mon/OSDMonitor: make 'osd crush rm ...' slightly more idempotent + + This is a manual backport of 18a624fd8b90d9959de51f07622cf0839e6bd9aa. + Do not return immediately if we are looking at uncommitted state.t + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit bfc26c656d183fbcc90a352391e47f9f51c96052 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 17:46:40 2013 -0700 + + mon/OSDMonitor: fix base case for loading full osdmap + + Right after cluster creation, first_committed is 1 and latest stashed in 0, + but we don't have the initial full map yet. Thereafter, we do (because we + write it with trim). Fixes afd6c7d8247075003e5be439ad59976c3d123218. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 43fa7aabf1f7e5deb844c1f52d451bab9e7d1006) + +commit 7fb3804fb860dcd0340dd3f7c39eec4315f8e4b6 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 15:04:59 2013 -0700 + + mon: fix osdmap stash, trim to retain complete history of full maps + + The current interaction between sync and stashing full osdmaps only on + active mons means that a sync can result in an incomplete osdmap_full + history: + + - mon.c starts a full sync + - during sync, active osdmap service should_stash_full() is true and + includes a full in the txn + - mon.c sync finishes + - mon.c update_from_paxos gets "latest" stashed that it got from the + paxos txn + - mon.c does *not* walk to previous inc maps to complete it's collection + of full maps. + + To fix this, we disable the periodic/random stash of full maps by the + osdmap service. + + This introduces a new problem: we must have at least one full map (the first + one) in order for a mon that just synced to build it's full collection. + Extend the encode_trim() process to allow the osdmap service to include + the oldest full map with the trim txn. This is more complex than just + writing the full maps in the txn, but cheaper--we only write the full + map at trim time. + + This *might* be related to previous bugs where the full osdmap was + missing, or case where leveldb keys seemed to 'disappear'. + + Fixes: #5512 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit afd6c7d8247075003e5be439ad59976c3d123218) + +commit 24f90b832c695ef13021db66a178c18369ac356d +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 15:07:57 2013 -0700 + + mon: implement simple 'scrub' command + + Compare all keys within the sync'ed prefixes across members of the quorum + and compare the key counts and CRC for inconsistencies. + + Currently this is a one-shot inefficient hammer. 
We'll want to make this + work in chunks before it is usable in production environments. + + Protect with a feature bit to avoid sending MMonScrub to mons who can't + decode it. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit a9906641a1dce150203b72682da05651e4d68ff5) + + Conflicts: + + src/mon/MonCommands.h + src/mon/Monitor.cc + +commit 926f723c12428a034545c6c4ff6641e1d5e05d24 +Author: Samuel Just <sam.just@inktank.com> +Date: Wed Jul 3 11:18:33 2013 -0700 + + Elector.h: features are 64 bit + + Fixes: #5497 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit 3564e304e3f50642e4d9ff25e529d5fc60629093) + +commit c2b38291e706c9d1d4d337cee3a944f34bf66525 +Author: Samuel Just <sam.just@inktank.com> +Date: Wed Jul 3 11:18:19 2013 -0700 + + ceph_features.h: declare all features as ULL + + Otherwise, the first 32 get |'d together as ints. Then, the result + ((int)-1) is sign extended to ((long long int)-1) before being |'d + with the 1LL entries. This results in ~((uint64_t)0). + + Fixes: #5497 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit 4255b5c2fb54ae40c53284b3ab700fdfc7e61748) + +commit 95ef961d8537fc369efd0634262ffb8f288d6e9e +Author: Samuel Just <sam.just@inktank.com> +Date: Tue Jul 2 21:09:36 2013 -0700 + + Pipe: use uint64_t not unsigned when setting features + + Fixes: #5497 + Signed-off-by: Samuel Just <sam.just@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit bc3e2f09f8860555d8b3b49b2eea164b4118d817) + +commit 09d258b70a28e5cea555b9d7e215fe41d6b84577 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 11:24:48 2013 -0700 + + client: remove O_LAZY + + The once-upon-a-time unique O_LAZY value I chose forever ago is now + O_NOATIME, which means that some clients are choosing relaxed + consistency without meaning to. + + It is highly unlikely that a real O_LAZY will ever exist, and we can + select it in the ceph case with the ioctl or libcephfs call, so drop + any support for doing this via open(2) flags. + + Update doc/lazy_posix.txt file re: lazy io. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + (cherry picked from commit 94afedf02d07ad4678222aa66289a74b87768810) + +commit c3b684932bad31fc853ad556d16e1e4a9926486e +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 8 12:55:20 2013 -0700 + + osd/osd_types: fix pg_stat_t::dump for last_epoch_clean + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 69a55445439fce0dd6a3d32ff4bf436da42f1b11) + +commit a02f2510fcc800b9f2cf2a06401a7b97d5985409 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jul 5 16:03:49 2013 -0700 + + mon: remove bad assert about monmap version + + It is possible to start a sync when our newest monmap is 0. Usually we see + e0 from probe, but that isn't always published as part of the very first + paxos transaction due to the way PaxosService::_active generates it's + first initial commit. + + In any case, having e0 here is harmless. 
+ + Fixes: #5509 + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 85a1d6cc5d3852c94d1287b566656c5b5024fa13) + +commit 6ad9fe17a674ba65bbeb4052cb1ac47f3113e7bf +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 4 19:33:06 2013 -0700 + + mon/Paxos: fix sync restart + + If we have a sync going, and an election intervenes, the client will + try to continue by sending a new start_chunks request. In order to + ensure that we get all of the paxos commits from our original starting + point (and thus properly update the keys from which they started), + only pay attention if they *also* send their current last_committed + version. Otherwise, start them at the beginning. + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit c5812b1c893305a7d20f9eaec2695c8b1691f0c9 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 4 14:57:06 2013 -0700 + + mon: uninline _trim_enable and Paxos::trim_{enable,disable} so we can debug them + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit 6fbcbd7fddf35a5be4b38e536871903bff4f9bf1 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 4 14:55:34 2013 -0700 + + mon/Paxos: increase paxos max join drift + + A value of 10 is too aggressive for large, long-running syncs. 100 is + about 2 minutes of activity at most, which should be a more forgiving + buffer. + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit f3a51fa30e5ce1656853b40d831409f195f6e4ca +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 4 14:21:04 2013 -0700 + + mon/Paxos: configure minimum paxos txns separately + + We were using paxos_max_join_drift to control the minimum number of + paxos transactions to keep around. Instead, make this explicit, and + separate from the join drift. + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit 1156721f22f5f337241eef3d0276ca74fe6352d1 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 4 17:09:07 2013 -0700 + + mon: include any new paxos commits in each sync CHUNK message + + We already take note of the paxos version when we begin the sync. As + sync progresses and there are new paxos commits/txns, include those + and update last_committed, so that when sync completes we will have + a full view of everything that happened during sync. + + Note that this does not introduce any compatibility change. This change + *only* affects the provider. The key difference is that at the end + of the sync, the provide will set version to the latest version, and + not the version from the start of the sync (as was done previously). + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit 40672219a081f0dc2dd536977290ef05cfc9f097 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jul 4 12:17:28 2013 -0700 + + mon/MonitorDBStore: expose get_chunk_tx() + + Allow users get the transaction unencoded. + + Signed-off-by: Sage Weil <sage@inktank.com> + +commit db2bb270e93ed44f9252d65d1d4c9b36875d0ea5 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jul 3 17:15:56 2013 -0700 + + mon: enable leveldb cache by default + + 256 is not as large as the upstream 512 MB, but will help signficiantly and + be less disruptive for existing cuttlefish clusters. + + Sort-of backport of e93730b7ffa48b53c8da2f439a60cb6805facf5a. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + +commit 123f676e3ae8154ca94cb076c4c4ec5389d2a643 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jul 3 16:56:06 2013 -0700 + + mon/Paxos: make 'paxos trim disabled max versions' much much larger + + 108000 is about 3 hours if paxos is going full-bore (1 proposal/second). + That ought to be pretty safe. Otherwise, we start trimming to soon and a + slow sync will just have to restart when it finishes. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 71ebfe7e1abe4795b46cf00dfe1b03d1893368b0) + + Conflicts: + + src/common/config_opts.h + +commit 03393c0df9f54e4f1db60e1058ca5a7cd89f44e6 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jun 26 06:01:40 2013 -0700 + + mon: do not reopen MonitorDBStore during startup + + level doesn't seem to like this when it races with an internal compaction + attempt (see below). Instead, let the store get opened by the ceph_mon + caller, and pull a bit of the logic into the caller to make the flow a + little easier to follow. + + -2> 2013-06-25 17:49:25.184490 7f4d439f8780 10 needs_conversion + -1> 2013-06-25 17:49:25.184495 7f4d4065c700 5 asok(0x13b1460) entry start + 0> 2013-06-25 17:49:25.316908 7f4d3fe5b700 -1 *** Caught signal (Segmentation fault) ** + in thread 7f4d3fe5b700 + + ceph version 0.64-667-g089cba8 (089cba8fc0e8ae8aef9a3111cba7342ecd0f8314) + 1: ceph-mon() [0x649f0a] + 2: (()+0xfcb0) [0x7f4d435dccb0] + 3: (leveldb::Table::BlockReader(void*, leveldb::ReadOptions const&, leveldb::Slice const&)+0x154) [0x806e54] + 4: ceph-mon() [0x808840] + 5: ceph-mon() [0x808b39] + 6: ceph-mon() [0x806540] + 7: (leveldb::DBImpl::DoCompactionWork(leveldb::DBImpl::CompactionState*)+0xdd) [0x7f363d] + 8: (leveldb::DBImpl::BackgroundCompaction()+0x2c0) [0x7f4210] + 9: (leveldb::DBImpl::BackgroundCall()+0x68) [0x7f4cc8] + 10: ceph-mon() [0x80b3af] + 11: (()+0x7e9a) [0x7f4d435d4e9a] + 12: (clone()+0x6d) [0x7f4d4196bccd] + NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ea1f316e5de21487ae034a1aa929068ba23ac525) + +commit 0143acc49bc5834836afc2c5a9d8f67030bec85f +Author: Sage Weil <sage@inktank.com> +Date: Tue Jul 2 14:43:17 2013 -0700 + + sysvinit, upstart: handle symlinks to dirs in /var/lib/ceph/* + + Match a symlink to a dir, not just dirs. This fixes the osd case of e.g., + creating an osd in /data/osd$id in which ceph-disk makes a symlink from + /var/lib/ceph/osd/ceph-$id. + + Fix proposed by Matt Thompson <matt.thompson@mandiant.com>; extended to + include the upstart users too. + + Fixes: #5490 + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit 87c98e92d1375c8bc76196bbbf06f677bef95e64) + +commit 7e878bcc8c1b51538f3c05f854a9dac74c09b116 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jul 1 17:33:11 2013 -0700 + + rgw: add RGWFormatter_Plain allocation to sidestep cranky strlen() + + Valgrind complains about an invalid read when we don't pad the allocation, + and because it is inlined we can't whitelist it for valgrind. Workaround + the warning by just padding our allocations a bit. 
+ + Fixes: #5346 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 49ff63b1750789070a8c6fef830c9526ae0f6d9f) + +commit ca61402855966210ba1598239eaf454eaad0f5f2 +Author: Yan, Zheng <zheng.z.yan@intel.com> +Date: Wed May 15 11:24:36 2013 +0800 + + mds: warn on unconnected snap realms + + When there are more than one active MDS, restarting MDS triggers + assertion "reconnected_snaprealms.empty()" quite often. If there + is no snapshot in the FS, the items left in reconnected_snaprealms + should be other MDS' mdsdir. I think it's harmless. + + If there are snapshots in the FS, the assertion probably can catch + real bugs. But at present, snapshot feature is broken, fixing it is + non-trivial. So replace the assertion with a warning. + + Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com> + (cherry picked from commit 26effc0e583b0a3dade6ec81ef26dec1c94ac8b2) + +commit e11f258831e14dc3755e09c0fd4f9bfdf79022a7 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jun 26 06:53:08 2013 -0700 + + mon/PGMonitor: use post_paxos_update, not init, to refresh from osdmap + + We do two things here: + - make init an one-time unconditional init method, which is what the + health service expects/needs. + - switch PGMonitor::init to be post_paxos_update() which is called after + the other services update, which is what PGMonitor really needs. + + This is a new version of the fix originally in commit + a2fe0137946541e7b3b537698e1865fbce974ca6 (and those around it). That is, + this re-fixes a problem where osds do not see pg creates from their + subscribe due to map_pg_creates() not getting called. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e635c47851d185eda557e36bdc4bf3775f7b87a2) + + Conflicts: + src/mon/PGMonitor.cc + src/mon/PGMonitor.h + +commit 4d07fb014178da3c88edeb8765e1aaacb8cb8ffa +Author: Sage Weil <sage@inktank.com> +Date: Wed Jun 26 06:52:01 2013 -0700 + + mon/PaxosService: add post_paxos_update() hook + + Some services need to update internal state based on other service's + state, and thus need to be run after everyone has pulled their info out of + paxos. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 131686980f0a930d5de7cbce8234fead5bd438b6) + +commit 90f5c448abeb127ae5a5528a79bd7bdbc74cb497 +Author: Greg Farnum <greg@inktank.com> +Date: Thu Jun 27 14:58:14 2013 -0700 + + ceph-disk: s/else if/elif/ + + Signed-off-by: Greg Farnum <greg@inktank.com> + Reviewed-by: Joao Luis <joao.luis@inktank.com> + (cherry picked from commit bd8255a750de08c1b8ee5e9c9a0a1b9b16171462) + (cherry picked from commit 9e604ee6943fdb131978afbec51321050faddfc6) + +commit 5c4bb463dca5aa61ea5f02f7592d5a3cc82cf6f4 +Author: Yehuda Sadeh <yehuda@inktank.com> +Date: Wed Jun 26 11:28:57 2013 -0700 + + rgw: fix radosgw-admin buckets list + + Fixes: #5455 + Backport: cuttlefish + This commit fixes a regression, where radosgw-admin buckets list + operation wasn't returning any data. + + Signed-off-by: Yehuda Sadeh <yehuda@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e1f9fe58d2860fcbb18c92d3eb3946236b49a6ce) + +commit b2fb48762f32279e73feb83b220339fea31275e9 +Author: Sage Weil <sage@inktank.com> +Date: Wed Jun 19 17:27:49 2013 -0700 + + ceph-disk: use unix lock instead of lockfile class + + The lockfile class relies on file system trickery to get safe mutual + exclusion. However, the unix syscalls do this for us. 
More + importantly, the unix locks go away when the owning process dies, which + is behavior that we want here. + + Fixes: #5387 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit 2a4953b697a3464862fd3913336edfd7eede2487) + +commit 26e7a6fffde4abcb685f34247e8491c05ee2a68d +Author: Sage Weil <sage@inktank.com> +Date: Wed Jun 26 18:27:49 2013 -0700 + + ceph-disk: do not mount over an osd directly in /var/lib/ceph/osd/$cluster-$id + + If we see a 'ready' file in the target OSD dir, do not mount our device + on top of it. + + Among other things, this prevents ceph-disk activate on stray disks from + stepping on teuthology osds. + + Fixes: #5445 + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 8a17f33b14d858235dfeaa42be1f4842dcfd66d2) + +commit ccb3dd5ad5533ca4e9b656b4e3df31025a5f2017 +Author: Yan, Zheng <zheng.z.yan@intel.com> +Date: Tue Apr 2 15:46:51 2013 +0800 + + mds: fix underwater dentry cleanup + + If the underwater dentry is a remove link, we shouldn't mark the + inode clean + + Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com> + (cherry picked from commit 81d073fecb58e2294df12b71351321e6d2e69652) + +commit 3020c5ea07a91475a7261dc2b810f5b61a1ae1f2 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 24 18:51:07 2013 -0700 + + mon/Elector: cancel election timer if we bootstrap + + If we short-circuit and bootstrap, cancel our timer. Otherwise it will + go off some time later when we are in who knows what state. + + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 9ae0ec83dabe37ac15e5165559debdfef7a5f91d) + +commit 305f0c50a5f0ffabc73e10bdf4590217d5d5d211 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 24 18:12:11 2013 -0700 + + mon: cancel probe timeout on reset + + If we are probing and get (say) an election timeout that calls reset(), + cancel the timer. Otherwise, we assert later with a splat like + + 2013-06-24 01:09:33.675882 7fb9627e7700 4 mon.b@0(leader) e1 probe_timeout 0x307a520 + 2013-06-24 01:09:33.676956 7fb9627e7700 -1 mon/Monitor.cc: In function 'void Monitor::probe_timeout(int)' thread 7fb9627e7700 time 2013-06-24 01:09:43.675904 + mon/Monitor.cc: 1888: FAILED assert(is_probing() || is_synchronizing()) + + ceph version 0.64-613-g134d08a (134d08a9654f66634b893d493e4a92f38acc63cf) + 1: (Monitor::probe_timeout(int)+0x161) [0x56f5c1] + 2: (Context::complete(int)+0xa) [0x574a2a] + 3: (SafeTimer::timer_thread()+0x425) [0x7059a5] + 4: (SafeTimerThread::entry()+0xd) [0x7065dd] + 5: (()+0x7e9a) [0x7fb966f62e9a] + 6: (clone()+0x6d) [0x7fb9652f9ccd] + NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. + + Fixes: #5438 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 03d3be3eaa96a8e72754c36abd6f355c68d52d59) + +commit a8f601d543168f4cdbddf674479d8de4b8dfc732 +Author: Alexandre Maragone <alexandre.marangone@inktank.com> +Date: Tue Jun 18 16:18:01 2013 -0700 + + ceph-disk: make list_partition behave with unusual device names + + When you get device names like sdaa you do not want to mistakenly conclude that + sdaa is a partition of sda. Use /sys/block/$device/$partition existence + instead. 
+ + Fixes: #5211 + Backport: cuttlefish + Signed-off-by: Alexandre Maragone <alexandre.maragone@inktank.com> + Reviewed-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 8c0daafe003935881c5192e0b6b59b949269e5ae) + +commit 1c890f5cdfc596588e54fffeb016b4a5e9e2124c +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 17 20:28:24 2013 -0700 + + client: fix warning + + client/Client.cc: In member function 'virtual void Client::ms_handle_remote_reset(Connection*)': + warning: client/Client.cc:7892:9: enumeration value 'STATE_NEW' not handled in switch [-Wswitch] + warning: client/Client.cc:7892:9: enumeration value 'STATE_OPEN' not handled in switch [-Wswitch] + warning: client/Client.cc:7892:9: enumeration value 'STATE_CLOSED' not handled in switch [-Wswitch] + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: David Zafman <david.zafman@inktank.com> + (cherry picked from commit 8bd936f077530dfeb2e699164e4492b1c0973088) + +commit c3b97591fd8206825bcfe65bdb24fbc75a2a9b42 +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 24 17:58:48 2013 -0700 + + mon/AuthMonitor: ensure initial rotating keys get encoded when create_initial called 2x + + The create_initial() method may get called multiple times; make sure it + will unconditionally generate new/initial rotating keys. Move the block + up so that we can easily assert as much. + + Broken by commit cd98eb0c651d9ee62e19c2cc92eadae9bed678cd. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Yehuda Sadeh <yehuda@inktank.com> + (cherry picked from commit 521fdc2a4e65559b3da83283e6ca607b6e55406f) + +commit 0cc826c385edb2e327505696491d3ff1c3bfe8fd +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 24 17:42:04 2013 -0700 + + init-radosgw.sysv: remove -x debug mode + + Fixes: #5443 + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 31d6062076fdbcd2691c07a23b381b26abc59f65) + +commit 4d57c12faceb7f591f10776c6850d98da55c667b +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 24 12:52:44 2013 -0700 + + common/pick_addresses: behave even after internal_safe_to_start_threads + + ceph-mon recently started using Preforker to working around forking issues. + As a result, internal_safe_to_start_threads got set sooner and calls to + pick_addresses() which try to set string config values now fail because + there are no config observers for them. + + Work around this by observing the change while we adjust the value. We + assume pick_addresses() callers are smart enough to realize that their + result will be reflected by cct->_conf and not magically handled elsewhere. + + Fixes: #5195, #5205 + Backport: cuttlefish + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Dan Mick <dan.mick@inktank.com> + (cherry picked from commit eb86eebe1ba42f04b46f7c3e3419b83eb6fe7f9a) + +commit e1ac7c6c3ca673d08710829aa5a3c03735710486 +Author: Sage Weil <sage@inktank.com> +Date: Thu Jun 20 15:39:23 2013 -0700 + + mon/PaxosService: allow paxos service writes while paxos is updating + + In commit f985de28f86675e974ac7842a49922a35fe24c6c I mistakenly made + is_writeable() false while paxos was updating due to a misread of + Paxos::propose_new_value() (I didn't see that it would queue). + This is problematic because it narrows the window during which each service + is writeable for no reason. + + Allow service to be writeable both when paxos is active and updating. 
+ + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 11169693d086e67dcf168ce65ef6e13eebd1a1ab) + +commit 02b0b4a9acb439b2ee5deadc8b02492006492931 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 7 11:41:21 2013 -0700 + + mon/PaxosService: not active during paxos UPDATING_PREVIOUS + + Treat this as an extension of the recovery process, e.g. + + RECOVERING -> ACTIVE + or + RECOVERING -> UPDATING_PREVIOUS -> ACTIVE + + and we are not active until we get to "the end" in both cases. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 392a8e21f8571b410c85be2129ef62dd6fc52b54) + +commit c6d5dc4d47838c8c8f4d059b7d018dea3f9c4425 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 7 11:40:22 2013 -0700 + + mon: simplify states + + - make states mutually exclusive (an enum) + - rename locked -> updating_previous + - set state prior to begin() to simplify things a bit + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ee34a219605d1943740fdae0d84cfb9020302dd6) + +commit c43b1f4dff254df96144b0b4d569cc72421a8fff +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 7 11:14:58 2013 -0700 + + mon/Paxos: not readable when LOCKED + + If we are re-proposing a previously accepted value from a previous quorum, + we should not consider it readable, because it is possible it was exposed + to clients as committed (2/3 accepted) but not recored to be committed, and + we do not want to expose old state as readable when new state was + previously readable. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit ec2ea86ed55e00265c2cc5ad0c94460b4c92865c) + +commit 10d41200622d76dbf276602828584e7153cb22b5 +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 7 11:07:38 2013 -0700 + + mon/Paxos: cleanup: drop unused PREPARING state bit + + This is never set when we block, and nobody looks at it. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 7b7ea8e30e20704caad9a841332ecb2e39819a41) + +commit 9d7c40e3f4ea2dd969aa0264ea8a6ad74f3e678a +Author: Sage Weil <sage@inktank.com> +Date: Thu Jun 6 15:20:05 2013 -0700 + + mon/PaxosService: simplify is_writeable + + Recast this in terms of paxos check + our conditions, and make it + match wait_for_writeable(). + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit f985de28f86675e974ac7842a49922a35fe24c6c) + +commit 35745cba8985c5f3238e3c28fd28b194fae043d9 +Author: Sage Weil <sage@inktank.com> +Date: Tue Jun 4 17:03:15 2013 -0700 + + mon/PaxosService: simplify readable check + + Recast this in terms of the paxos check and our additional conditions, + which match wait_for_readable(). + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 3aa61a0beb540e48bf61ceded766d6ff52c95eb2) + +commit 57c89291a48c319907fb3029746d9f5a4bd9dd61 +Author: Sage Weil <sage@inktank.com> +Date: Fri May 31 16:45:08 2013 -0700 + + mon: simplify Monitor::init_paxos() + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e832e76a4af04b091c806ad412bcfd0326d75a2d) + +commit fd1769cb2d61e8f2c7921a78760e8f12b28258fb +Author: Sage Weil <sage@inktank.com> +Date: Fri May 31 16:39:37 2013 -0700 + + mon/Paxos: go active *after* refreshing + + The update_from_paxos() methods occasionally like to trigger new activity. + As long as they check is_readable() and is_writeable(), they will defer + until we go active and that activity will happen in the normal callbacks. 
+ + This fixes the problem where we active but is_writeable() is still false, + triggered by PGMonitor::check_osd_map(). + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e68b1bd36ed285e38a558899f83cf224d3aa60ed) + +commit cf75478d027dfd377424988745230d096dae79ac +Author: Sage Weil <sage@inktank.com> +Date: Fri May 31 15:32:06 2013 -0700 + + mon: safely signal bootstrap from MonmapMonitor::update_from_paxos() + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit dc83430124a5fd37573202a4cc0986c3c03739ef) + +commit 6ac58cd9c1f9c80c5f3cbe97e19cfcd8427db46d +Author: Sage Weil <sage@inktank.com> +Date: Sun Jun 2 16:57:11 2013 -0700 + + mon/Paxos: do paxos refresh in finish_proposal; and refactor + + Do the paxos refresh inside finish_proposal, ordered *after* the leader + assertion so that MonmapMonitor::update_from_paxos() calling bootstrap() + does not kill us. + + Also, remove unnecessary finish_queued_proposal() and move the logic inline + where the bad leader assertion is obvious. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit a42d7582f816b45f5d19c393fd45447555e78fdd) + +commit 054e96d96533b1c4078402e43184f13b97329905 +Author: Joao Eduardo Luis <joao.luis@inktank.com> +Date: Sun Jun 2 16:15:02 2013 -0700 + + mon/PaxosService: cache {first,last}_committed + + Refresh the in-memory values when we are told the on-disk paxos state + may have changed. + + Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com> + (cherry picked from commit 2fccb300bdf6ffd44db3462eb05115da11322ed4) + +commit 265212a7384399bf85e15e6978bc7543824c0e92 +Author: Sage Weil <sage@inktank.com> +Date: Fri May 31 14:30:48 2013 -0700 + + mon: no need to refresh from _active + + The refresh is done explicitly by the monitor, independent of the more + fragile PaxosService callbacks. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit d941363d6e4249e97b64faff0e573f75e918ac0c) + +commit 1d8662504299babec22c714662cefbb86a0acb8b +Author: Sage Weil <sage@inktank.com> +Date: Sun Jun 2 16:10:57 2013 -0700 + + mon: remove unnecessary update_from_paxos calls + + The refresh() will do this when the state changes; no need to + opportunistically call this method all of the time. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 03014a4ecc06cde420fad0c6c2a0177ebd7b839d) + +commit 34acc5a3161b6bcda2b9f7ce18d89a8618fff1c5 +Author: Sage Weil <sage@inktank.com> +Date: Sun Jun 2 16:14:01 2013 -0700 + + mon: explicitly refresh_from_paxos() when leveldb state changes + + Instead of opportunistically calling each service's update_from_paxos(), + instead explicitly refresh all in-memory state whenever we know the + paxos state may have changed. This is simpler and less fragile. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit cc339c07312006e65854207523f50542d00ecf87) + +commit 4474a0cc6c009a566ecf46efadb39d80343a7c68 +Author: Sage Weil <sage@inktank.com> +Date: Sun Jun 23 09:25:55 2013 -0700 + + mon/AuthMonitor: make initial auth include rotating keys + + This closes a very narrow race during mon creation where there are no + service keys. 
+ + Fixes: #5427 + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit cd98eb0c651d9ee62e19c2cc92eadae9bed678cd) + +commit d572cf6f77418f217a5a8e37f1124dc566e24d0b +Author: Sage Weil <sage@inktank.com> +Date: Fri Jun 21 11:53:29 2013 -0700 + + mds: fix iterator invalidation for backtrace removal + + - Don't increment before we dereference! + - We need to update the iterator before we delete the item. + + This code is changed in master, so this fix is for cuttlefish only. + + Signed-off-by: Sage Weil <sage@inktank.com> + Reviewed-by: Greg Farnum <greg@inktank.com> + +commit 50957772c3582290331f69ba4a985b1cdf86834d +Author: Sage Weil <sage@inktank.com> +Date: Thu May 9 09:44:20 2013 -0700 + + osd: init test_ops_hook + + CID 1019628 (#1 of 1): Uninitialized pointer field (UNINIT_CTOR) + 2. uninit_member: Non-static class member "test_ops_hook" is not initialized in this constructor nor in any functions that it calls. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit e30a03210c3efb768b1653df5ae58917ef26e579) + +commit 17d2745f095e7bb640dece611d7824d370ea3b81 +Author: Sage Weil <sage@inktank.com> +Date: Thu May 9 09:45:51 2013 -0700 + + osd: initialize OSDService::next_notif_id + + CID 1019627 (#1 of 1): Uninitialized scalar field (UNINIT_CTOR) + 2. uninit_member: Non-static class member "next_notif_id" is not initialized in this constructor nor in any functions that it calls. + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 499edd8bfc355c2d590f5fa1ef197d1ea5680351) + +commit ffdb7236a994aa20b5f75860b9c81dac0f131f9a +Author: Sage Weil <sage@inktank.com> +Date: Thu Jun 20 09:46:42 2013 -0700 + + mon: more fix dout use in sync_requester_abort() + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit d60534b8f59798feaeeaa17adba2a417d7777cbf) + +commit 38ddae04bb974a93f1718c509363f1afbe6b612d +Author: Sage Weil <sage@inktank.com> +Date: Mon Jun 10 11:48:25 2013 -0700 + + mon: fix raw use of *_dout in sync_requester_abort() + + Signed-off-by: Sage Weil <sage@inktank.com> + (cherry picked from commit 8a4ed58e39b287fd8667c62b45848487515bdc80) diff --git a/doc/install/rpm.rst b/doc/install/rpm.rst index d5d6bf196c2..d96628a0f95 100644 --- a/doc/install/rpm.rst +++ b/doc/install/rpm.rst @@ -16,6 +16,27 @@ release key to your system's list of trusted keys to avoid a security warning:: sudo rpm --import 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' +Install Prerequisites +===================== + +Ceph may require additional additional third party libraries. +To add the EPEL repository, execute the following:: + + su -c 'rpm -Uvh http://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm' + +Some releases of Ceph require the following packages: + +- snappy +- leveldb +- gdisk +- python-argparse +- gperftools-libs + +To install these packages, execute the following:: + + sudo yum install snappy leveldb gdisk python-argparse gperftools-libs + + Add Release Packages ==================== @@ -31,13 +52,9 @@ Packages are currently built for the RHEL/CentOS6 (``el6``), Fedora 17 platforms. The repository package installs the repository details on your local system for use with ``yum`` or ``up2date``. 
-Replase the``{DISTRO}`` below with the distro codename:: - - su -c 'rpm -Uvh http://ceph.com/rpm-cuttlefish/{DISTRO}/x86_64/ceph-release-1-0.el6.noarch.rpm' - For example, for CentOS 6 or other RHEL6 derivatives (``el6``):: - su -c 'rpm -Uvh http://ceph.com/rpm-cuttlefish/el6/x86_64/ceph-release-1-0.el6.noarch.rpm' + su -c 'rpm -Uvh http://ceph.com/rpm-cuttlefish/el6/noarch/ceph-release-1-0.el6.noarch.rpm' You can download the RPMs directly from:: @@ -99,11 +116,23 @@ You can download the RPMs directly from:: http://ceph.com/rpm-testing + +Installing Ceph Deploy +====================== + +Once you have added either release or development packages to ``yum``, you +can install ``ceph-deploy``. :: + + sudo yum install ceph-deploy python-pushy + + + Installing Ceph Packages ======================== Once you have added either release or development packages to ``yum``, you -can install Ceph:: +can install Ceph packages. You can also use ``ceph-deploy`` to install Ceph +packages. :: sudo yum install ceph @@ -198,7 +227,7 @@ Installing Ceph Object Storage #. Create a user key. :: ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway - ceph-authtool -n client.radosgw.gateway --cap mon 'allow r' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway + ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway ceph auth add client.radosgw.gateway --in-file=/etc/ceph/keyring.radosgw.gateway diff --git a/doc/man/8/radosgw.rst b/doc/man/8/radosgw.rst index 46511f9afe6..0fb114973f5 100644 --- a/doc/man/8/radosgw.rst +++ b/doc/man/8/radosgw.rst @@ -86,7 +86,7 @@ You will also have to generate a key for the radosgw to use for authentication with the cluster:: ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway - ceph-authtool -n client.radosgw.gateway --cap mon 'allow r' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway + ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway And add the key to the auth entries:: diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst index d56f6ef584a..0b71d08b0c4 100644 --- a/doc/rados/operations/authentication.rst +++ b/doc/rados/operations/authentication.rst @@ -196,7 +196,7 @@ capabilities necessary for the daemon to function, are shown below. ``radosgw`` :Location: ``$rgw_data/keyring`` -:Capabilities: ``mon 'allow r' osd 'allow rwx'`` +:Capabilities: ``mon 'allow rw' osd 'allow rwx'`` Note that the monitor keyring contains a key but no capabilities, and diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst index d7526fdd776..615a979fb5d 100644 --- a/doc/radosgw/config.rst +++ b/doc/radosgw/config.rst @@ -164,7 +164,7 @@ Generate a key so that RADOS Gateway can identify a user name and authenticate the user with the cluster. Then, add capabilities to the key. For example:: sudo ceph-authtool /etc/ceph/keyring.radosgw.gateway -n client.radosgw.gateway --gen-key - sudo ceph-authtool -n client.radosgw.gateway --cap osd 'allow rwx' --cap mon 'allow r' /etc/ceph/keyring.radosgw.gateway + sudo ceph-authtool -n client.radosgw.gateway --cap osd 'allow rwx' --cap mon 'allow rw' /etc/ceph/keyring.radosgw.gateway Add to Ceph Keyring Entries @@ -173,7 +173,7 @@ Add to Ceph Keyring Entries Once you have created a keyring and key for RADOS GW, add it as an entry in the Ceph keyring. 
For example:: - sudo ceph -k /etc/ceph/ceph.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway + sudo ceph -k /etc/ceph/ceph.client.admin.keyring auth add client.radosgw.gateway -i /etc/ceph/keyring.radosgw.gateway Restart Services and Start the RADOS Gateway diff --git a/doc/rbd/rbd-snapshot.rst b/doc/rbd/rbd-snapshot.rst index 9b209777df5..0152258df63 100644 --- a/doc/rbd/rbd-snapshot.rst +++ b/doc/rbd/rbd-snapshot.rst @@ -287,13 +287,13 @@ Listing Children of a Snapshot To list the children of a snapshot, execute the following:: - rbd --pool {pool-name} snap children --image {image-name} --snap {snap-name} - rbd snap children {pool-name}/{image-name}@{snapshot-name} + rbd --pool {pool-name} children --image {image-name} --snap {snap-name} + rbd children {pool-name}/{image-name}@{snapshot-name} For example:: - rbd --pool rbd snap children --image my-image --snap my-snapshot - rbd snap children rbd/my-image@my-snapshot + rbd --pool rbd children --image my-image --snap my-snapshot + rbd children rbd/my-image@my-snapshot Flattening a Cloned Image diff --git a/doc/release-notes.rst b/doc/release-notes.rst index ba3b9be8363..f5e76febbac 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -15,7 +15,7 @@ Upgrading Notable changes ~~~~~~~~~~~~~~~ -* osd: pg log (re)writes are not vastly more efficient (faster peering) (Sam Just) +* osd: pg log (re)writes are now vastly more efficient (faster peering) (Sam Just) * osd: fixed problem with front-side heartbeats and mixed clusters (David Zafman) * mon: tuning, performance improvements * mon: simplify PaxosService vs Paxos interaction, fix readable/writeable checks @@ -195,6 +195,51 @@ Notable Changes * misc code cleanups +v0.61.5 "Cuttlefish" +-------------------- + +This release most improves stability of the monitor and fixes a few +bugs with the ceph-disk utility (used by ceph-deploy). We recommand +that all v0.61.x users upgrade. + +Upgrading +~~~~~~~~~ + +* This release fixes a 32-bit vs 64-bit arithmetic bug with the + feature bits. An unfortunate consequence of the fix is that 0.61.4 + (or earlier) ceph-mon daemons can't form a quorum with 0.61.5 (or + later) monitors. To avoid the possibility of service disruption, we + recommend you upgrade all monitors at once. 
+ +Notable Changes +~~~~~~~~~~~~~~~ + +* mon: misc sync improvements (faster, more reliable, better tuning) +* mon: enable leveldb cache by default (big performance improvement) +* mon: new scrub feature (primarily for diagnostic, testing purposes) +* mon: fix occasional leveldb assertion on startup +* mon: prevent reads until initial state is committed +* mon: improved logic for trimming old osdmaps +* mon: fix pick_addresses bug when expanding mon cluster +* mon: several small paxos fixes, improvements +* mon: fix bug osdmap trim behavior +* osd: fix several bugs with PG stat reporting +* osd: limit number of maps shared with peers (which could cause domino failures) +* rgw: fix radosgw-admin buckets list (for all buckets) +* mds: fix occasional client failure to reconnect +* mds: fix bad list traversal after unlink +* mds: fix underwater dentry cleanup (occasional crash after mds restart) +* libcephfs, ceph-fuse: fix occasional hangs on umount +* libcephfs, ceph-fuse: fix old bug with O_LAZY vs O_NOATIME confusion +* ceph-disk: more robust journal device detection on RHEL/CentOS +* ceph-disk: better, simpler locking +* ceph-disk: do not inadvertantely mount over existing osd mounts +* ceph-disk: better handling for unusual device names +* sysvinit, upstart: handle symlinks in /var/lib/ceph/* + +For more detailed information, see :download:`the complete changelog <changelog/v0.61.5.txt>`. + + v0.61.4 "Cuttlefish" -------------------- @@ -232,7 +277,7 @@ Notable Changes * ceph-fuse: fix thread creation on startup * all daemons: create /var/run/ceph directory on startup if missing -For more detailed information, see :download:`the complete changelog <changelog/v0.61.3.txt>`. +For more detailed information, see :download:`the complete changelog <changelog/v0.61.4.txt>`. v0.61.3 "Cuttlefish" diff --git a/src/ceph.in b/src/ceph.in index 6ba92c99b18..e6806786e7e 100755 --- a/src/ceph.in +++ b/src/ceph.in @@ -118,6 +118,8 @@ def parse_cmdargs(args=None, target=''): parser.add_argument('--admin-daemon', dest='admin_socket', help='submit admin-socket commands (\"help\" for help') + parser.add_argument('--admin-socket', dest='admin_socket_nope', + help='you probably mean --admin-daemon') parser.add_argument('-s', '--status', action='store_true', help='show cluster status') @@ -395,9 +397,9 @@ def find_cmd_target(childargs): right daemon. 
Returns ('osd', osdid), ('pg', pgid), or ('mon', '') """ - sig = parse_funcsig(['tell', {'name':'target','type':'CephName'}]) + sig = parse_funcsig(['tell', {'name':'target', 'type':'CephName'}]) try: - valid_dict = validate(childargs, sig, partial=True); + valid_dict = validate(childargs, sig, partial=True) if len(valid_dict) == 2: name = CephName() name.valid(valid_dict['target']) @@ -405,9 +407,9 @@ def find_cmd_target(childargs): except ArgumentError: pass - sig = parse_funcsig(['pg', {'name':'pgid','type':'CephPgid'}]) + sig = parse_funcsig(['pg', {'name':'pgid', 'type':'CephPgid'}]) try: - valid_dict = validate(childargs, sig, partial=True); + valid_dict = validate(childargs, sig, partial=True) if len(valid_dict) == 2: return 'pg', valid_dict['pgid'] except ArgumentError: @@ -489,6 +491,11 @@ def main(): global verbose verbose = parsed_args.verbose + if parsed_args.admin_socket_nope: + print >> sys.stderr, '--admin-socket is used by daemons; '\ + 'you probably mean --admin-daemon/daemon' + return 1 + # pass on --id, --name, --conf name = 'client.admin' if parsed_args.client_id: @@ -582,7 +589,7 @@ def main(): # implement -w/--watch_* # This is ugly, but Namespace() isn't quite rich enough. level = '' - for k,v in parsed_args._get_kwargs(): + for k, v in parsed_args._get_kwargs(): if k.startswith('watch') and v: if k == 'watch': level = 'info' @@ -670,8 +677,8 @@ def main(): prefix = '' suffix = '' if not parsed_args.output_file and len(targets) > 1: - prefix='{0}.{1}: '.format(*target) - suffix='\n' + prefix = '{0}.{1}: '.format(*target) + suffix = '\n' ret, outbuf, outs = json_command(cluster_handle, target=target, prefix='get_command_descriptions') @@ -733,7 +740,7 @@ def main(): if parsed_args.output_format and \ parsed_args.output_format.startswith('json') and \ not compat: - sys.stdout.write('\n'); + sys.stdout.write('\n') # if we are prettifying things, normalize newlines. sigh. if suffix != '': diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc index edb48bd96d8..88b807b1b24 100644 --- a/src/ceph_mds.cc +++ b/src/ceph_mds.cc @@ -274,12 +274,6 @@ int main(int argc, const char **argv) messenger->start(); - // set up signal handlers, now that we've daemonized/forked. - init_async_signal_handler(); - register_async_signal_handler(SIGHUP, sighup_handler); - register_async_signal_handler_oneshot(SIGINT, handle_mds_signal); - register_async_signal_handler_oneshot(SIGTERM, handle_mds_signal); - // start mds mds = new MDS(g_conf->name.get_id().c_str(), messenger, &mc); @@ -291,16 +285,26 @@ int main(int argc, const char **argv) r = mds->init(shadow); else r = mds->init(); + if (r < 0) + goto shutdown; - if (r >= 0) { - messenger->wait(); - } + // set up signal handlers, now that we've daemonized/forked. + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_mds_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_mds_signal); + + if (g_conf->inject_early_sigterm) + kill(getpid(), SIGTERM); + + messenger->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_mds_signal); unregister_async_signal_handler(SIGTERM, handle_mds_signal); shutdown_async_signal_handler(); + shutdown: // yuck: grab the mds lock, so we can be sure that whoever in *mds // called shutdown finishes what they were doing. 
mds->mds_lock.Lock(); @@ -313,14 +317,15 @@ int main(int argc, const char **argv) if (mds->is_stopped()) delete mds; + g_ceph_context->put(); + // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. char s[20]; snprintf(s, sizeof(s), "gmon/%d", getpid()); if ((mkdir(s, 0755) == 0) && (chdir(s) == 0)) { - dout(0) << "ceph-mds: gmon.out should be in " << s << dendl; + cerr << "ceph-mds: gmon.out should be in " << s << std::endl; } - generic_dout(0) << "stopped." << dendl; return 0; } diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 6ac22ba20e5..35ed56a7985 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -542,15 +542,18 @@ int main(int argc, const char **argv) if (g_conf->daemonize) prefork.daemonize(); + messenger->start(); + + mon->init(); + // set up signal handlers, now that we've daemonized/forked. init_async_signal_handler(); register_async_signal_handler(SIGHUP, sighup_handler); register_async_signal_handler_oneshot(SIGINT, handle_mon_signal); register_async_signal_handler_oneshot(SIGTERM, handle_mon_signal); - messenger->start(); - - mon->init(); + if (g_conf->inject_early_sigterm) + kill(getpid(), SIGTERM); messenger->wait(); diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index b485133514e..d8590bff817 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -451,12 +451,6 @@ int main(int argc, const char **argv) messenger_hb_back_server->start(); cluster_messenger->start(); - // install signal handlers - init_async_signal_handler(); - register_async_signal_handler(SIGHUP, sighup_handler); - register_async_signal_handler_oneshot(SIGINT, handle_osd_signal); - register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal); - // start osd err = osd->init(); if (err < 0) { @@ -465,6 +459,15 @@ int main(int argc, const char **argv) return 1; } + // install signal handlers + init_async_signal_handler(); + register_async_signal_handler(SIGHUP, sighup_handler); + register_async_signal_handler_oneshot(SIGINT, handle_osd_signal); + register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal); + + if (g_conf->inject_early_sigterm) + kill(getpid(), SIGTERM); + client_messenger->wait(); messenger_hbclient->wait(); messenger_hb_front_server->wait(); diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index 79171da46f1..cb211f5461b 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -1470,8 +1470,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) dout(10) << "trace finished on line " << t.get_line() << dendl; // wait for safe after an object trace - safegref->finish(0); - delete safegref; + safegref->complete(0); lock.Lock(); while (!safe) { dout(10) << "waiting for safe" << dendl; diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 83395534d0d..ce0c6de1260 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -14,6 +14,7 @@ #define FUSE_USE_VERSION 26 +#include <fuse/fuse.h> #include <fuse/fuse_lowlevel.h> #include <signal.h> #include <stdio.h> @@ -520,7 +521,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids) return 0; } - *sgids = malloc(c*sizeof(**sgids)); + *sgids = (gid_t*)malloc(c*sizeof(**sgids)); if (!*sgids) { return -ENOMEM; } diff --git a/src/common/config_opts.h b/src/common/config_opts.h index defb71ee514..b43808e211c 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -123,6 +123,8 @@ OPTION(ms_inject_delay_max, OPT_DOUBLE, 1) // seconds OPTION(ms_inject_delay_probability, OPT_DOUBLE, 0) // 
range [0, 1] OPTION(ms_inject_internal_delays, OPT_DOUBLE, 0) // seconds +OPTION(inject_early_sigterm, OPT_BOOL, false) + OPTION(mon_data, OPT_STR, "/var/lib/ceph/mon/$cluster-$id") OPTION(mon_initial_members, OPT_STR, "") // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster OPTION(mon_sync_fs_threshold, OPT_INT, 5) // sync() when writing this many objects; 0 to disable. @@ -207,6 +209,7 @@ OPTION(paxos_trim_min, OPT_INT, 250) // number of extra proposals tolerated bef OPTION(paxos_trim_max, OPT_INT, 500) // max number of extra proposals to trim at a time OPTION(paxos_service_trim_min, OPT_INT, 250) // minimum amount of versions to trigger a trim (0 disables it) OPTION(paxos_service_trim_max, OPT_INT, 500) // maximum amount of versions to trim during a single proposal (0 disables it) +OPTION(paxos_kill_at, OPT_INT, 0) OPTION(clock_offset, OPT_DOUBLE, 0) // how much to offset the system clock in Clock.cc OPTION(auth_cluster_required, OPT_STR, "cephx") // required of mon, mds, osd daemons OPTION(auth_service_required, OPT_STR, "cephx") // required by daemons of clients @@ -675,6 +678,8 @@ OPTION(rgw_md_log_max_shards, OPT_INT, 64) // max shards for metadata log OPTION(rgw_num_zone_opstate_shards, OPT_INT, 128) // max shards for keeping inter-region copy progress info OPTION(rgw_opstate_ratelimit_sec, OPT_INT, 30) // min time between opstate updates on a single upload (0 for disabling ratelimit) OPTION(rgw_curl_wait_timeout_ms, OPT_INT, 1000) // timeout for certain curl calls +OPTION(rgw_copy_obj_progress, OPT_BOOL, true) // should dump progress during long copy operations? +OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT, 1024 * 1024) // min bytes between copy progress output OPTION(rgw_data_log_window, OPT_INT, 30) // data log entries window (in seconds) OPTION(rgw_data_log_changes_size, OPT_INT, 1000) // number of in-memory entries to hold for data changes log diff --git a/src/include/Context.h b/src/include/Context.h index e31fca6a426..9ec4414a047 100644 --- a/src/include/Context.h +++ b/src/include/Context.h @@ -34,10 +34,12 @@ class Context { Context(const Context& other); const Context& operator=(const Context& other); + protected: + virtual void finish(int r) = 0; + public: Context() {} virtual ~Context() {} // we want a virtual destructor!!! 
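// A small self-contained illustration (not ceph's Context class itself) of the
// callback idiom this series converts call sites to: finish() becomes
// protected, so code outside the class has to go through complete(), which
// both runs the callback and frees it, replacing the repetitive and
// leak/double-free prone "c->finish(r); delete c;" pair seen in the hunks below.
#include <iostream>

class Callback {
protected:
  virtual void finish(int r) = 0;      // subclasses implement the actual work
public:
  virtual ~Callback() {}
  void complete(int r) { finish(r); delete this; }   // one-shot: run, then free
};

struct PrintResult : public Callback {
protected:
  void finish(int r) override { std::cout << "finished, r=" << r << std::endl; }
};

int main() {
  Callback *c = new PrintResult;
  c->complete(0);                      // never "c->finish(0); delete c;"
  return 0;
}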
- virtual void finish(int r) = 0; virtual void complete(int r) { finish(r); delete this; diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc index a7ab0488dc6..da02a34ed32 100644 --- a/src/librbd/LibrbdWriteback.cc +++ b/src/librbd/LibrbdWriteback.cc @@ -32,8 +32,7 @@ namespace librbd { void context_cb(rados_completion_t c, void *arg) { Context *con = reinterpret_cast<Context *>(arg); - con->finish(rados_aio_get_return_value(c)); - delete con; + con->complete(rados_aio_get_return_value(c)); } /** diff --git a/src/log/Log.cc b/src/log/Log.cc index e06afbfe1e2..afeb1208002 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -42,9 +42,6 @@ Log::Log(SubsystemMap *s) { int ret; - ret = pthread_spin_init(&m_lock, PTHREAD_PROCESS_SHARED); - assert(ret == 0); - ret = pthread_mutex_init(&m_flush_mutex, NULL); assert(ret == 0); @@ -73,7 +70,6 @@ Log::~Log() if (m_fd >= 0) TEMP_FAILURE_RETRY(::close(m_fd)); - pthread_spin_destroy(&m_lock); pthread_mutex_destroy(&m_queue_mutex); pthread_mutex_destroy(&m_flush_mutex); pthread_cond_destroy(&m_cond_loggers); diff --git a/src/log/Log.h b/src/log/Log.h index f6a27dc5b37..b5e16fdde79 100644 --- a/src/log/Log.h +++ b/src/log/Log.h @@ -21,7 +21,6 @@ class Log : private Thread SubsystemMap *m_subs; - pthread_spinlock_t m_lock; pthread_mutex_t m_queue_mutex; pthread_mutex_t m_flush_mutex; pthread_cond_t m_cond_loggers; diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc index bcc8710e43f..30cbfd34f74 100644 --- a/src/mds/AnchorClient.cc +++ b/src/mds/AnchorClient.cc @@ -51,8 +51,7 @@ void AnchorClient::handle_query_result(class MMDSTableRequest *m) for (list<_pending_lookup>::iterator q = ls.begin(); q != ls.end(); ++q) { *q->trace = trace; if (q->onfinish) { - q->onfinish->finish(0); - delete q->onfinish; + q->onfinish->complete(0); } } } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 48529948955..d215d18690f 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -914,8 +914,7 @@ void CInode::_stored(version_t v, Context *fin) if (v == get_projected_version()) mark_clean(); - fin->finish(0); - delete fin; + fin->complete(0); } struct C_Inode_Fetched : public Context { @@ -964,13 +963,12 @@ void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) if (magic != CEPH_FS_ONDISK_MAGIC) { dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC << "'" << dendl; - fin->finish(-EINVAL); + fin->complete(-EINVAL); } else { decode_store(p); dout(10) << "_fetched " << *this << dendl; - fin->finish(0); + fin->complete(0); } - delete fin; } void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index e592dde96ca..77d3d8b97b8 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -510,8 +510,7 @@ void MDCache::_create_system_file_finish(Mutation *mut, CDentry *dn, version_t d mut->cleanup(); delete mut; - fin->finish(0); - delete fin; + fin->complete(0); //if (dir && MDS_INO_IS_MDSDIR(in->ino())) //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET); @@ -3093,8 +3092,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mdr->more()->slave_commit) { Context *fin = mdr->more()->slave_commit; mdr->more()->slave_commit = 0; - fin->finish(-1); - delete fin; + fin->complete(-1); } else { if (mdr->slave_request) mdr->aborted = true; @@ -7675,8 +7673,7 @@ public: mdcache(mdc), ino(i), want_xlocked(wx), onfinish(c) {} void finish(int r) { if (mdcache->get_inode(ino)) { - onfinish->finish(0); - delete onfinish; 
+ onfinish->complete(0); } else mdcache->open_remote_ino(ino, onfinish, want_xlocked); } @@ -7703,8 +7700,7 @@ public: if (r == 0) mdcache->open_remote_ino_2(ino, anchortrace, want_xlocked, hadino, hadv, onfinish); else { - onfinish->finish(r); - delete onfinish; + onfinish->complete(r); } } }; @@ -7753,8 +7749,7 @@ void MDCache::open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool if (in->ino() == ino) { // success dout(10) << "open_remote_ino_2 have " << *in << dendl; - onfinish->finish(0); - delete onfinish; + onfinish->complete(0); return; } @@ -7795,8 +7790,7 @@ void MDCache::open_remote_ino_2(inodeno_t ino, vector<Anchor>& anchortrace, bool dout(10) << "expected ino " << anchortrace[i].ino << " in complete dir " << *dir << ", got same anchor " << anchortrace[i] << " 2x in a row" << dendl; - onfinish->finish(-ENOENT); - delete onfinish; + onfinish->complete(-ENOENT); } else { // hrm. requery anchor table. dout(10) << "expected ino " << anchortrace[i].ino @@ -8408,8 +8402,7 @@ void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip) dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl; } else { dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl; - fip.fin->finish(-ESTALE); - delete fip.fin; + fip.fin->complete(-ESTALE); find_ino_peer.erase(fip.tid); } } else { @@ -8521,8 +8514,7 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r) { dout(10) << "_find_ino_dir " << ino << " got " << r << " " << bl.length() << " bytes" << dendl; if (r < 0) { - fin->finish(r); - delete fin; + fin->complete(r); return; } @@ -8539,8 +8531,7 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r) return; delete c; // path_traverse doesn't clean it up for us for r <= 0 - fin->finish(r); - delete fin; + fin->complete(r); } @@ -8619,8 +8610,7 @@ void MDCache::request_finish(MDRequest *mdr) if (mdr->more()->slave_commit) { Context *fin = mdr->more()->slave_commit; mdr->more()->slave_commit = 0; - fin->finish(0); // this must re-call request_finish. - delete fin; + fin->complete(0); // this must re-call request_finish. return; } diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index b293c4cc10a..3dfc00fc221 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -173,8 +173,7 @@ void MDLog::submit_entry(LogEvent *le, Context *c) if (!g_conf->mds_log) { // hack: log is disabled. if (c) { - c->finish(0); - delete c; + c->complete(0); } return; } @@ -245,8 +244,7 @@ void MDLog::wait_for_safe(Context *c) journaler->wait_for_flush(c); } else { // hack: bypass. - c->finish(0); - delete c; + c->complete(0); } } @@ -442,8 +440,7 @@ void MDLog::replay(Context *c) if (journaler->get_read_pos() == journaler->get_write_pos()) { dout(10) << "replay - journal empty, done." 
<< dendl; if (c) { - c->finish(0); - delete c; + c->complete(0); } return; } diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index a867961ccf3..7dcf68822aa 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1628,19 +1628,18 @@ void MDS::suicide() } timer.cancel_all_events(); //timer.join(); + timer.shutdown(); // shut down cache mdcache->shutdown(); if (objecter->initialized) objecter->shutdown_locked(); - - // shut down messenger - messenger->shutdown(); monc->shutdown(); - timer.shutdown(); + // shut down messenger + messenger->shutdown(); } void MDS::respawn() @@ -1890,8 +1889,7 @@ bool MDS::_dispatch(Message *m) ls.swap(finished_queue); while (!ls.empty()) { dout(10) << " finish " << ls.front() << dendl; - ls.front()->finish(0); - delete ls.front(); + ls.front()->complete(0); ls.pop_front(); // give other threads (beacon!) a chance diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc index 4b21f4feaa5..ef0326dfbd3 100644 --- a/src/mds/MDSTable.cc +++ b/src/mds/MDSTable.cc @@ -160,7 +160,6 @@ void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish) decode_state(p); if (onfinish) { - onfinish->finish(0); - delete onfinish; + onfinish->complete(0); } } diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc index b4781497068..cc3152f1d67 100644 --- a/src/mds/MDSTableClient.cc +++ b/src/mds/MDSTableClient.cc @@ -61,8 +61,7 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m) pending_prepare.erase(reqid); prepared_update[tid] = reqid; if (onfinish) { - onfinish->finish(0); - delete onfinish; + onfinish->complete(0); } } else if (prepared_update.count(tid)) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index c0bea19d16e..b50a03cefa4 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -330,8 +330,7 @@ struct MDSlaveUpdate { ~MDSlaveUpdate() { item.remove_myself(); if (waiter) - waiter->finish(0); - delete waiter; + waiter->complete(0); } }; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 7e484e8db6b..f537c915945 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -605,13 +605,13 @@ void Monitor::shutdown() finish_contexts(g_ceph_context, waitfor_quorum, -ECANCELED); finish_contexts(g_ceph_context, maybe_wait_for_quorum, -ECANCELED); - timer.shutdown(); + remove_all_sessions(); + // unlock before msgr shutdown... lock.Unlock(); - remove_all_sessions(); messenger->shutdown(); // last thing! ceph_mon.cc will delete mon. 
} diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index ee2ba3b6fdb..508669deef5 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -103,11 +103,21 @@ void Paxos::collect(version_t oldpn) // look for uncommitted value if (get_store()->exists(get_name(), last_committed+1)) { + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + uncommitted_pn = pn; + } else { + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << accepted_pn + << " and crossing our fingers" << dendl; + uncommitted_pn = accepted_pn; + } uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; + get_store()->get(get_name(), last_committed+1, uncommitted_value); assert(uncommitted_value.length()); dout(10) << "learned uncommitted " << (last_committed+1) + << " pn " << uncommitted_pn << " (" << uncommitted_value.length() << " bytes) from myself" << dendl; } @@ -164,6 +174,8 @@ void Paxos::handle_collect(MMonPaxos *collect) last->last_committed = last_committed; last->first_committed = first_committed; + version_t previous_pn = accepted_pn; + // can we accept this pn? if (collect->pn > accepted_pn) { // ok, accept it @@ -198,13 +210,25 @@ void Paxos::handle_collect(MMonPaxos *collect) // do we have an accepted but uncommitted value? // (it'll be at last_committed+1) bufferlist bl; - if (get_store()->exists(get_name(), last_committed+1)) { + if (collect->last_committed == last_committed && + get_store()->exists(get_name(), last_committed+1)) { get_store()->get(get_name(), last_committed+1, bl); assert(bl.length() > 0); dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 << " (" << bl.length() << " bytes)" << dendl; last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; + + version_t v = get_store()->get(get_name(), "pending_v"); + version_t pn = get_store()->get(get_name(), "pending_pn"); + if (v && pn && v == last_committed + 1) { + last->uncommitted_pn = pn; + } else { + // previously we didn't record which pn a value was accepted + // under! use the pn value we just had... :( + dout(10) << "WARNING: no pending_pn on disk, using previous accepted_pn " << previous_pn + << " and crossing our fingers" << dendl; + last->uncommitted_pn = previous_pn; + } } // send reply @@ -370,9 +394,13 @@ void Paxos::handle_last(MMonPaxos *last) return; } + assert(g_conf->paxos_kill_at != 1); + // store any committed values if any are specified in the message store_state(last); + assert(g_conf->paxos_kill_at != 2); + // do they accept your pn? if (last->pn > accepted_pn) { // no, try again. @@ -390,15 +418,23 @@ void Paxos::handle_last(MMonPaxos *last) << num_last << " peons" << dendl; // did this person send back an accepted but uncommitted value? 
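// An illustrative sketch (not the Paxos/MonitorDBStore API) of the rule the
// collect()/handle_collect() hunks above implement: when sharing an
// accepted-but-uncommitted value at last_committed+1, report the pn it was
// accepted under if the store recorded one ("pending_v"/"pending_pn"), and
// fall back to the current accepted_pn for stores written before this change.
#include <cstdint>
#include <map>
#include <string>

using version_t = uint64_t;

version_t uncommitted_pn_to_share(const std::map<std::string, version_t>& store,
                                  version_t last_committed,
                                  version_t accepted_pn)
{
  auto v = store.find("pending_v");
  auto pn = store.find("pending_pn");
  if (v != store.end() && pn != store.end() &&
      v->second == last_committed + 1 && pn->second != 0)
    return pn->second;      // value was recorded together with its pn: use it
  return accepted_pn;       // legacy store: best effort, "cross our fingers"
}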
- if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << dendl; + if (last->uncommitted_pn) { + if (last->uncommitted_pn > uncommitted_pn && + last->last_committed >= last_committed && + last->last_committed + 1 >= uncommitted_v) { + uncommitted_v = last->last_committed+1; + uncommitted_pn = last->uncommitted_pn; + uncommitted_value = last->values[uncommitted_v]; + dout(10) << "we learned an uncommitted value for " << uncommitted_v + << " pn " << uncommitted_pn + << " " << uncommitted_value.length() << " bytes" + << dendl; + } else { + dout(10) << "ignoring uncommitted value for " << (last->last_committed+1) + << " pn " << last->uncommitted_pn + << " " << last->values[last->last_committed+1].length() << " bytes" + << dendl; + } } // is that everyone? @@ -502,6 +538,10 @@ void Paxos::begin(bufferlist& v) MonitorDBStore::Transaction t; t.put(get_name(), last_committed+1, new_value); + // note which pn this pending value is for. + t.put(get_name(), "pending_v", last_committed + 1); + t.put(get_name(), "pending_pn", accepted_pn); + dout(30) << __func__ << " transaction dump:\n"; JSONFormatter f(true); t.dump(&f); @@ -516,6 +556,8 @@ void Paxos::begin(bufferlist& v) get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 3); + if (mon->get_quorum().size() == 1) { // we're alone, take it easy commit(); @@ -566,6 +608,8 @@ void Paxos::handle_begin(MMonPaxos *begin) assert(begin->pn == accepted_pn); assert(begin->last_committed == last_committed); + assert(g_conf->paxos_kill_at != 4); + // set state. state = STATE_UPDATING; lease_expire = utime_t(); // cancel lease @@ -578,6 +622,10 @@ void Paxos::handle_begin(MMonPaxos *begin) MonitorDBStore::Transaction t; t.put(get_name(), v, begin->values[v]); + // note which pn this pending value is for. + t.put(get_name(), "pending_v", v); + t.put(get_name(), "pending_pn", accepted_pn); + dout(30) << __func__ << " transaction dump:\n"; JSONFormatter f(true); t.dump(&f); @@ -586,6 +634,8 @@ void Paxos::handle_begin(MMonPaxos *begin) get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 5); + // reply MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, ceph_clock_now(g_ceph_context)); @@ -620,6 +670,8 @@ void Paxos::handle_accept(MMonPaxos *accept) accepted.insert(from); dout(10) << " now " << accepted << " have accepted" << dendl; + assert(g_conf->paxos_kill_at != 6); + // new majority? if (accepted.size() == (unsigned)mon->monmap->size()/2+1) { // yay, commit! @@ -643,6 +695,8 @@ void Paxos::handle_accept(MMonPaxos *accept) // yay! extend_lease(); + assert(g_conf->paxos_kill_at != 10); + finish_round(); // wake people up @@ -673,6 +727,8 @@ void Paxos::commit() // leader still got a majority and committed with out us.) lease_expire = utime_t(); // cancel lease + assert(g_conf->paxos_kill_at != 7); + MonitorDBStore::Transaction t; // commit locally @@ -692,6 +748,8 @@ void Paxos::commit() get_store()->apply_transaction(t); + assert(g_conf->paxos_kill_at != 8); + // refresh first_committed; this txn may have trimmed. 
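// A hedged illustration of the fault-injection idiom behind the
// assert(g_conf->paxos_kill_at != N) lines added above. "paxos_kill_at" is a
// test-only option (0 = disabled); setting it to N aborts the monitor the
// moment execution reaches the N-th marked point in the propose/accept/commit
// path, so tests can crash a mon at an exact step and exercise recovery.
// The names below are illustrative, not the Ceph implementation.
#include <cassert>

static int conf_paxos_kill_at = 0;   // stand-in for g_conf->paxos_kill_at

inline void kill_point(int n)
{
  // aborts only when the test explicitly asked to die at point n
  assert(conf_paxos_kill_at != n);
}

// e.g. kill_point(3) right after the pending value is written,
//      kill_point(7) just before the local commit is applied.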
first_committed = get_store()->get(get_name(), "first_committed"); @@ -713,6 +771,8 @@ void Paxos::commit() mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); } + assert(g_conf->paxos_kill_at != 9); + // get ready for a new round. new_value.clear(); diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index cab27f289a8..69419e64ab9 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -290,8 +290,9 @@ private: */ version_t accepted_pn; /** - * @todo This has something to do with the last_committed version. Not sure - * about what it entails, tbh. + * The last_committed epoch of the leader at the time we accepted the last pn. + * + * This has NO SEMANTIC MEANING, and is there only for the debug output. */ version_t accepted_pn_from; /** @@ -1114,7 +1115,7 @@ public: * @param t The transaction to which we will append the operations * @param bl A bufferlist containing an encoded transaction */ - void decode_append_transaction(MonitorDBStore::Transaction& t, + static void decode_append_transaction(MonitorDBStore::Transaction& t, bufferlist& bl) { MonitorDBStore::Transaction vt; bufferlist::iterator it = bl.begin(); diff --git a/src/os/FDCache.h b/src/os/FDCache.h index cf07f860aa5..f0f40e7bbf4 100644 --- a/src/os/FDCache.h +++ b/src/os/FDCache.h @@ -28,6 +28,7 @@ * FD Cache */ class FDCache : public md_config_obs_t { +public: /** * FD * @@ -47,8 +48,10 @@ class FDCache : public md_config_obs_t { } }; +private: SharedLRU<hobject_t, FD> registry; CephContext *cct; + public: FDCache(CephContext *cct) : cct(cct) { assert(cct); diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 10f2b1f2aad..17105c11d69 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -220,7 +220,8 @@ int FileStore::lfn_open(coll_t cid, r = get_index(cid, index); } Mutex::Locker l(fdcache_lock); - *outfd = fdcache.lookup(oid); + if (!replaying) + *outfd = fdcache.lookup(oid); if (*outfd) { return 0; } @@ -258,7 +259,10 @@ int FileStore::lfn_open(coll_t cid, goto fail; } } - *outfd = fdcache.add(oid, fd); + if (!replaying) + *outfd = fdcache.add(oid, fd); + else + *outfd = FDRef(new FDCache::FD(fd)); return 0; fail: @@ -3060,7 +3064,8 @@ int FileStore::_write(coll_t cid, const hobject_t& oid, r = bl.length(); // flush? 
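// A sketch (not the actual FileStore/FDCache API) of the replay guard the
// FileStore hunks around here add: while the journal is being replayed, object
// opens bypass the shared FD cache (the descriptor is wrapped privately
// instead) and writes skip writeback-throttle accounting, so replayed
// operations do not disturb cache or throttle state used by live requests.
#include <fcntl.h>
#include <unistd.h>
#include <functional>
#include <memory>
#include <string>

struct CachedFD {
  int fd;
  explicit CachedFD(int f) : fd(f) {}
  ~CachedFD() { if (fd >= 0) ::close(fd); }
};
using FDRefT = std::shared_ptr<CachedFD>;

// 'lookup'/'add' stand in for the real FD cache; 'replaying' is the flag consulted.
FDRefT open_for_object(const std::string& path, bool replaying,
                       const std::function<FDRefT(const std::string&)>& lookup,
                       const std::function<FDRefT(const std::string&, int)>& add)
{
  if (!replaying) {
    if (FDRefT cached = lookup(path))
      return cached;                       // normal path: share via the cache
  }
  int fd = ::open(path.c_str(), O_RDWR);
  if (fd < 0)
    return nullptr;
  return replaying ? std::make_shared<CachedFD>(fd)  // private, uncached ref
                   : add(path, fd);                  // publish to the cache
}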
- wbthrottle.queue_wb(fd, oid, offset, len, replica); + if (!replaying) + wbthrottle.queue_wb(fd, oid, offset, len, replica); lfn_close(fd); out: diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 464ed770df2..3f226cec95d 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4739,11 +4739,12 @@ bool OSDService::prepare_to_stop() if (state != NOT_STOPPING) return false; - if (get_osdmap()->is_up(whoami)) { + OSDMapRef osdmap = get_osdmap(); + if (osdmap && osdmap->is_up(whoami)) { state = PREPARING_TO_STOP; monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(), - get_osdmap()->get_inst(whoami), - get_osdmap()->get_epoch(), + osdmap->get_inst(whoami), + osdmap->get_epoch(), false )); utime_t now = ceph_clock_now(g_ceph_context); diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 6ba08362dad..dac1f33fd91 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -375,7 +375,6 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead } assert(p->version > newhead); dout(10) << "rewind_divergent_log future divergent " << *p << dendl; - log.unindex(*p); } log.head = newhead; @@ -383,6 +382,7 @@ void PGLog::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead if (info.last_complete > newhead) info.last_complete = newhead; + log.index(); for (list<pg_log_entry_t>::iterator d = divergent.begin(); d != divergent.end(); ++d) merge_old_entry(t, *d, info, remove_snap); @@ -505,7 +505,6 @@ void PGLog::merge_log(ObjectStore::Transaction& t, break; dout(10) << "merge_log divergent " << oe << dendl; divergent.push_front(oe); - log.unindex(oe); log.log.pop_back(); } diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 9c8d42dbf3c..298d38d6ace 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -7767,6 +7767,14 @@ void ReplicatedPG::_scrub_finish() #undef dout_prefix #define dout_prefix *_dout << pg->gen_prefix() +ReplicatedPG::SnapTrimmer::~SnapTrimmer() +{ + while (!repops.empty()) { + (*repops.begin())->put(); + repops.erase(repops.begin()); + } +} + void ReplicatedPG::SnapTrimmer::log_enter(const char *state_name) { dout(20) << "enter " << state_name << dendl; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 0d4867f6e6d..9dafe23faa1 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -982,6 +982,7 @@ private: bool need_share_pg_info; bool requeue; SnapTrimmer(ReplicatedPG *pg) : pg(pg), need_share_pg_info(false), requeue(false) {} + ~SnapTrimmer(); void log_enter(const char *state_name); void log_exit(const char *state_name, utime_t duration); } snap_trimmer_machine; diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc index 7eb4ad616db..8f94a97d292 100644 --- a/src/osdc/Filer.cc +++ b/src/osdc/Filer.cc @@ -131,8 +131,7 @@ void Filer::_probed(Probe *probe, const object_t& oid, uint64_t size, utime_t mt return; // waiting for more! if (probe->err) { // we hit an error, propagate back up - probe->onfinish->finish(probe->err); - delete probe->onfinish; + probe->onfinish->complete(probe->err); delete probe; return; } @@ -216,8 +215,7 @@ void Filer::_probed(Probe *probe, const object_t& oid, uint64_t size, utime_t mt } // done! finish and clean up. 
- probe->onfinish->finish(probe->err); - delete probe->onfinish; + probe->onfinish->complete(probe->err); delete probe; } @@ -285,8 +283,7 @@ void Filer::_do_purge_range(PurgeRange *pr, int fin) << " uncommitted " << pr->uncommitted << dendl; if (pr->num == 0 && pr->uncommitted == 0) { - pr->oncommit->finish(0); - delete pr->oncommit; + pr->oncommit->complete(0); delete pr; return; } diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc index cd9b9edc4c7..ba4ca8dc4b9 100644 --- a/src/osdc/Journaler.cc +++ b/src/osdc/Journaler.cc @@ -181,8 +181,7 @@ void Journaler::_finish_reread_head(int r, bufferlist& bl, Context *finish) trimmed_pos = trimming_pos = h.trimmed_pos; init_headers(h); state = STATE_ACTIVE; - finish->finish(r); - delete finish; + finish->complete(r); } void Journaler::_finish_read_head(int r, bufferlist& bl) @@ -261,8 +260,7 @@ void Journaler::_finish_reprobe(int r, uint64_t new_end, Context *onfinish) { << dendl; prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = new_end; state = STATE_ACTIVE; - onfinish->finish(r); - delete onfinish; + onfinish->complete(r); } void Journaler::_finish_probe_end(int r, uint64_t end) @@ -367,8 +365,7 @@ void Journaler::_finish_write_head(int r, Header &wrote, Context *oncommit) ldout(cct, 10) << "_finish_write_head " << wrote << dendl; last_committed = wrote; if (oncommit) { - oncommit->finish(r); - delete oncommit; + oncommit->complete(r); } trim(); // trim? @@ -563,8 +560,7 @@ void Journaler::wait_for_flush(Context *onsafe) ldout(cct, 10) << "flush nothing to flush, (prezeroing/prezero)/write/flush/safe pointers at " << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos << "/" << flush_pos << "/" << safe_pos << dendl; if (onsafe) { - onsafe->finish(0); - delete onsafe; + onsafe->complete(0); onsafe = 0; } return; @@ -584,8 +580,7 @@ void Journaler::flush(Context *onsafe) ldout(cct, 10) << "flush nothing to flush, (prezeroing/prezero)/write/flush/safe pointers at " << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos << "/" << flush_pos << "/" << safe_pos << dendl; if (onsafe) { - onsafe->finish(0); - delete onsafe; + onsafe->complete(0); } } else { if (1) { @@ -731,8 +726,7 @@ void Journaler::_finish_read(int r, uint64_t offset, bufferlist& bl) if (on_readable) { Context *f = on_readable; on_readable = 0; - f->finish(r); - delete f; + f->complete(r); } return; } @@ -779,8 +773,7 @@ void Journaler::_assimilate_prefetch() if (on_readable) { Context *f = on_readable; on_readable = 0; - f->finish(0); - delete f; + f->complete(0); } } } @@ -1060,8 +1053,7 @@ void Journaler::handle_write_error(int r) { lderr(cct) << "handle_write_error " << cpp_strerror(r) << dendl; if (on_write_error) { - on_write_error->finish(r); - delete on_write_error; + on_write_error->complete(r); on_write_error = NULL; } else { assert(0 == "unhandled write error"); diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index a5a023cb33e..9933f853f8f 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -321,8 +321,7 @@ void Objecter::_linger_ack(LingerOp *info, int r) { ldout(cct, 10) << "_linger_ack " << info->linger_id << dendl; if (info->on_reg_ack) { - info->on_reg_ack->finish(r); - delete info->on_reg_ack; + info->on_reg_ack->complete(r); info->on_reg_ack = NULL; } } @@ -331,8 +330,7 @@ void Objecter::_linger_commit(LingerOp *info, int r) { ldout(cct, 10) << "_linger_commit " << info->linger_id << dendl; if (info->on_reg_commit) { - info->on_reg_commit->finish(r); - delete info->on_reg_commit; + 
info->on_reg_commit->complete(r); info->on_reg_commit = NULL; } @@ -676,8 +674,7 @@ void Objecter::handle_osd_map(MOSDMap *m) //go through the list and call the onfinish methods for (list<pair<Context*, int> >::iterator i = p->second.begin(); i != p->second.end(); ++i) { - i->first->finish(i->second); - delete i->first; + i->first->complete(i->second); } waiting_for_map.erase(p++); } @@ -1622,12 +1619,10 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) // do callbacks if (onack) { - onack->finish(rc); - delete onack; + onack->complete(rc); } if (oncommit) { - oncommit->finish(rc); - delete oncommit; + oncommit->complete(rc); } m->put(); @@ -1646,8 +1641,7 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish) { << "\nlist_context->cookie" << list_context->cookie << dendl; if (list_context->at_end) { - onfinish->finish(0); - delete onfinish; + onfinish->complete(0); return; } @@ -1667,8 +1661,7 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish) { list_context->starting_pg_num = pg_num; } if (list_context->current_pg == pg_num){ //this context got all the way through - onfinish->finish(0); - delete onfinish; + onfinish->complete(0); return; } @@ -1722,9 +1715,8 @@ void Objecter::_list_reply(ListContext *list_context, int r, bufferlist *bl, ldout(cct, 20) << "got a response with objects, proceeding" << dendl; list_context->list.merge(response.entries); if (response_size >= list_context->max_entries) { - final_finish->finish(0); + final_finish->complete(0); delete bl; - delete final_finish; return; } @@ -1756,8 +1748,7 @@ void Objecter::_list_reply(ListContext *list_context, int r, bufferlist *bl, ldout(cct, 20) << "out of pgs, returning to" << final_finish << dendl; list_context->at_end = true; delete bl; - final_finish->finish(0); - delete final_finish; + final_finish->complete(0); return; } @@ -1799,8 +1790,7 @@ struct C_SelfmanagedSnap : public Context { bufferlist::iterator p = bl.begin(); ::decode(*psnapid, p); } - fin->finish(r); - delete fin; + fin->complete(r); } }; @@ -1975,8 +1965,7 @@ void Objecter::handle_pool_op_reply(MPoolOpReply *m) wait_for_new_map(op->onfinish, m->epoch, m->replyCode); } else { - op->onfinish->finish(m->replyCode); - delete op->onfinish; + op->onfinish->complete(m->replyCode); } op->onfinish = NULL; delete op; @@ -2033,8 +2022,7 @@ void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m) *op->pool_stats = m->pool_stats; if (m->version > last_seen_pgmap_version) last_seen_pgmap_version = m->version; - op->onfinish->finish(0); - delete op->onfinish; + op->onfinish->complete(0); poolstat_ops.erase(tid); delete op; @@ -2085,8 +2073,7 @@ void Objecter::handle_fs_stats_reply(MStatfsReply *m) *(op->stats) = m->h.st; if (m->h.version > last_seen_pgmap_version) last_seen_pgmap_version = m->h.version; - op->onfinish->finish(0); - delete op->onfinish; + op->onfinish->complete(0); statfs_ops.erase(tid); delete op; @@ -2128,8 +2115,7 @@ void Objecter::_sg_read_finish(vector<ObjectExtent>& extents, vector<bufferlist> ldout(cct, 7) << "_sg_read_finish " << bytes_read << " bytes" << dendl; if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result()); - delete onfinish; + onfinish->complete(bytes_read);// > 0 ? 
bytes_read:m->get_result()); } } diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index c1cac88b60e..b593bef69d9 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -860,8 +860,7 @@ public: if (pmtime) *pmtime = m; } - fin->finish(r); - delete fin; + fin->complete(r); } }; @@ -875,8 +874,7 @@ public: bufferlist::iterator p = bl.begin(); ::decode(attrset, p); } - fin->finish(r); - delete fin; + fin->complete(r); } }; @@ -916,8 +914,7 @@ public: if (r >= 0) { objecter->_list_reply(list_context, r, bl, final_finish, epoch); } else { - final_finish->finish(r); - delete final_finish; + final_finish->complete(r); } } }; diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py index 72b36dd50a5..73d1115f645 100644 --- a/src/pybind/ceph_argparse.py +++ b/src/pybind/ceph_argparse.py @@ -321,18 +321,20 @@ class CephName(CephArgtype): Also accept '*' """ + def __init__(self): + self.nametype = None + self.nameid = None + def valid(self, s, partial=False): if s == '*': self.val = s - self.nametype = None - self.nameid = None return if s.find('.') == -1: raise ArgumentFormat('CephName: no . in {0}'.format(s)) else: t, i = s.split('.') if not t in ('osd', 'mon', 'client', 'mds'): - raise ArgumentValid('unknown type ' + self.t) + raise ArgumentValid('unknown type ' + t) if t == 'osd': if i != '*': try: @@ -352,19 +354,21 @@ class CephOsdName(CephArgtype): osd.<id>, or <id>, or *, where id is a base10 int """ + def __init__(self): + self.nametype = None + self.nameid = None + def valid(self, s, partial=False): if s == '*': self.val = s - self.nametype = None - self.nameid = None return if s.find('.') != -1: t, i = s.split('.') + if t != 'osd': + raise ArgumentValid('unknown type ' + t) else: t = 'osd' i = s - if t != 'osd': - raise ArgumentValid('unknown type ' + self.t) try: i = int(i) except: @@ -381,7 +385,7 @@ class CephChoices(CephArgtype): Set of string literals; init with valid choices """ def __init__(self, strings='', **kwargs): - self.strings=strings.split('|') + self.strings = strings.split('|') def valid(self, s, partial=False): if not partial: @@ -523,16 +527,16 @@ class argdesc(object): def __repr__(self): r = 'argdesc(' + str(self.t) + ', ' internals = ['N', 'typeargs', 'instance', 't'] - for (k,v) in self.__dict__.iteritems(): + for (k, v) in self.__dict__.iteritems(): if k.startswith('__') or k in internals: pass else: # undo modification from __init__ if k == 'n' and self.N: v = 'N' - r += '{0}={1}, '.format(k,v) - for (k,v) in self.typeargs.iteritems(): - r += '{0}={1}, '.format(k,v) + r += '{0}={1}, '.format(k, v) + for (k, v) in self.typeargs.iteritems(): + r += '{0}={1}, '.format(k, v) return r[:-2] + ')' def __str__(self): @@ -698,7 +702,7 @@ def matchnum(args, signature, partial=False): while desc.numseen < desc.n: # if there are no more arguments, return if not words: - return matchcnt; + return matchcnt word = words.pop(0) try: @@ -887,7 +891,7 @@ def validate_command(parsed_args, sigdict, args, verbose=False): return valid_dict -def send_command(cluster, target=('mon', ''), cmd=[], inbuf='', timeout=0, +def send_command(cluster, target=('mon', ''), cmd=None, inbuf='', timeout=0, verbose=False): """ Send a command to a daemon using librados's @@ -900,6 +904,7 @@ def send_command(cluster, target=('mon', ''), cmd=[], inbuf='', timeout=0, If target is osd.N, send command to that osd (except for pgid cmds) """ + cmd = cmd or [] try: if target[0] == 'osd': osdid = target[1] diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc index 
bdd458e68b6..c93de7cd58a 100644 --- a/src/rgw/rgw_auth_s3.cc +++ b/src/rgw/rgw_auth_s3.cc @@ -190,8 +190,14 @@ bool rgw_create_s3_canonical_header(req_info& info, utime_t *header_time, string map<string, string>& meta_map = info.x_meta_map; map<string, string>& sub_resources = info.args.get_sub_resources(); + string request_uri; + if (info.effective_uri.empty()) + request_uri = info.request_uri; + else + request_uri = info.effective_uri; + rgw_create_s3_canonical_header(info.method, content_md5, content_type, date.c_str(), - meta_map, info.request_uri.c_str(), sub_resources, + meta_map, request_uri.c_str(), sub_resources, dest); return true; diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index aea396bf3de..8a281775d07 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -109,7 +109,12 @@ void req_info::rebuild_from(req_info& src) { method = src.method; script_uri = src.script_uri; - request_uri = src.request_uri; + if (src.effective_uri.empty()) { + request_uri = src.request_uri; + } else { + request_uri = src.effective_uri; + } + effective_uri.clear(); host = src.host; x_meta_map = src.x_meta_map; diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 1d3596d4418..7f224a798f5 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -764,6 +764,7 @@ struct req_info { const char *method; string script_uri; string request_uri; + string effective_uri; string request_params; req_info(CephContext *cct, RGWEnv *_env); @@ -780,7 +781,7 @@ struct req_state { int format; ceph::Formatter *formatter; string decoded_uri; - string effective_uri; + string relative_uri; const char *length; uint64_t content_length; map<string, string> generic_attrs; diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 45477486ccc..7760a2f5c52 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1654,6 +1654,25 @@ int RGWCopyObj::init_common() return 0; } +static void copy_obj_progress_cb(off_t ofs, void *param) +{ + RGWCopyObj *op = static_cast<RGWCopyObj *>(param); + op->progress_cb(ofs); +} + +void RGWCopyObj::progress_cb(off_t ofs) +{ + if (!s->cct->_conf->rgw_copy_obj_progress) + return; + + if (ofs - last_ofs < s->cct->_conf->rgw_copy_obj_progress_every_bytes) + return; + + send_partial_response(ofs); + + last_ofs = ofs; +} + void RGWCopyObj::execute() { rgw_obj src_obj, dst_obj; @@ -1685,7 +1704,9 @@ void RGWCopyObj::execute() replace_attrs, attrs, RGW_OBJ_CATEGORY_MAIN, &s->req_id, /* use req_id as tag */ - &s->err); + &s->err, + copy_obj_progress_cb, (void *)this + ); } int RGWGetACLs::verify_permission() diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index e107b90a155..5da2e4f472c 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -438,6 +438,8 @@ protected: string client_id; string op_id; + off_t last_ofs; + int init_common(); @@ -460,6 +462,7 @@ public: ret = 0; mtime = 0; replace_attrs = false; + last_ofs = 0; } virtual void init(RGWRados *store, struct req_state *s, RGWHandler *h) { @@ -468,9 +471,11 @@ public: } int verify_permission(); void execute(); + void progress_cb(off_t ofs); virtual int init_dest_policy() { return 0; } virtual int get_params() = 0; + virtual void send_partial_response(off_t ofs) {} virtual void send_response() = 0; virtual const string name() { return "copy_obj"; } virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; } diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 0c7b22a42d3..8af03b03a8f 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -2397,9 +2397,16 @@ class RGWRadosPutObj : public 
RGWGetDataCB rgw_obj obj; RGWPutObjProcessor_Atomic *processor; RGWOpStateSingleOp *opstate; + void (*progress_cb)(off_t, void *); + void *progress_data; public: - RGWRadosPutObj(RGWPutObjProcessor_Atomic *p, RGWOpStateSingleOp *_ops) : processor(p), opstate(_ops) {} + RGWRadosPutObj(RGWPutObjProcessor_Atomic *p, RGWOpStateSingleOp *_ops, + void (*_progress_cb)(off_t, void *), void *_progress_data) : processor(p), opstate(_ops), + progress_cb(_progress_cb), + progress_data(_progress_data) {} int handle_data(bufferlist& bl, off_t ofs, off_t len) { + progress_cb(ofs, progress_data); + void *handle; int ret = processor->handle_data(bl, ofs, &handle); if (ret < 0) @@ -2477,7 +2484,9 @@ int RGWRados::copy_obj(void *ctx, map<string, bufferlist>& attrs, RGWObjCategory category, string *ptag, - struct rgw_err *err) + struct rgw_err *err, + void (*progress_cb)(off_t, void *), + void *progress_data) { int ret; uint64_t total_len, obj_size; @@ -2545,7 +2554,7 @@ int RGWRados::copy_obj(void *ctx, ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl; return ret; } - RGWRadosPutObj cb(&processor, &opstate); + RGWRadosPutObj cb(&processor, &opstate, progress_cb, progress_data); string etag; map<string, string> req_headers; time_t set_mtime; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index c9924e0dc56..bcc40900299 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1121,7 +1121,9 @@ public: map<std::string, bufferlist>& attrs, RGWObjCategory category, string *ptag, - struct rgw_err *err); + struct rgw_err *err, + void (*progress_cb)(off_t, void *), + void *progress_data); int copy_obj_data(void *ctx, void *handle, off_t end, diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc index 0f9e61d1740..e4933a67a39 100644 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@ -1242,7 +1242,7 @@ RGWHandler *RGWREST::get_handler(RGWRados *store, struct req_state *s, RGWClient if (*init_error < 0) return NULL; - RGWRESTMgr *m = mgr.get_resource_mgr(s, s->decoded_uri, &s->effective_uri); + RGWRESTMgr *m = mgr.get_resource_mgr(s, s->decoded_uri, &s->relative_uri); if (!m) { *init_error = -ERR_METHOD_NOT_ALLOWED; return NULL; diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc index 2075e535525..ea80b5b84f8 100644 --- a/src/rgw/rgw_rest_client.cc +++ b/src/rgw/rgw_rest_client.cc @@ -403,6 +403,7 @@ int RGWRESTStreamWriteRequest::put_obj_init(RGWAccessKey& key, rgw_obj& obj, uin new_info.script_uri = "/"; new_info.script_uri.append(resource); new_info.request_uri = new_info.script_uri; + new_info.effective_uri = new_info.effective_uri; map<string, string>& m = new_env.get_map(); map<string, bufferlist>::iterator bliter; @@ -568,6 +569,7 @@ int RGWRESTStreamReadRequest::get_obj(RGWAccessKey& key, map<string, string>& ex new_info.script_uri = "/"; new_info.script_uri.append(resource); new_info.request_uri = new_info.script_uri; + new_info.effective_uri = new_info.effective_uri; new_info.init_meta_info(NULL); diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 66f6652ec6a..6c1738218e6 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -1300,15 +1300,33 @@ int RGWCopyObj_ObjStore_S3::get_params() return 0; } -void RGWCopyObj_ObjStore_S3::send_response() +void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs) { - if (ret) + if (!sent_header) { + if (ret) set_req_state_err(s, ret); - dump_errno(s); + dump_errno(s); + + end_header(s, "binary/octet-stream"); + if (ret == 0) { + s->formatter->open_object_section("CopyObjectResult"); 
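// An illustrative sketch (not the rgw classes themselves) of the throttling
// rule progress_cb() applies in this change: partial progress is emitted at
// most once per rgw_copy_obj_progress_every_bytes of copied data, and only
// when rgw_copy_obj_progress is enabled, so a long (e.g. cross-zone) copy
// keeps the client connection alive without flooding it with responses.
#include <cstdint>
#include <functional>

struct CopyProgressThrottle {
  bool enabled = true;                 // rgw_copy_obj_progress
  int64_t every_bytes = 1024 * 1024;   // rgw_copy_obj_progress_every_bytes
  int64_t last_ofs = 0;
  std::function<void(int64_t)> emit;   // e.g. send_partial_response(ofs)

  void on_data(int64_t ofs) {
    if (!enabled || ofs - last_ofs < every_bytes)
      return;
    if (emit)
      emit(ofs);                       // S3 dumps a Progress element,
    last_ofs = ofs;                    // Swift appends an offset to a JSON array
  }
};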
+ } + sent_header = true; + } else { + /* Send progress field. Note that this diverge from the original S3 + * spec. We do this in order to keep connection alive. + */ + s->formatter->dump_int("Progress", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + +void RGWCopyObj_ObjStore_S3::send_response() +{ + if (!sent_header) + send_partial_response(0); - end_header(s, "binary/octet-stream"); if (ret == 0) { - s->formatter->open_object_section("CopyObjectResult"); dump_time(s, "LastModified", &mtime); map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG); if (iter != attrs.end()) { @@ -1801,7 +1819,7 @@ int RGWHandler_ObjStore_S3::init_from_header(struct req_state *s, int default_fo string req; string first; - const char *req_name = s->effective_uri.c_str(); + const char *req_name = s->relative_uri.c_str(); const char *p; if (*req_name == '?') { diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index e2a1b0b92eb..a0af4eac9fd 100644 --- a/src/rgw/rgw_rest_s3.h +++ b/src/rgw/rgw_rest_s3.h @@ -143,12 +143,14 @@ public: }; class RGWCopyObj_ObjStore_S3 : public RGWCopyObj_ObjStore { + bool sent_header; public: - RGWCopyObj_ObjStore_S3() {} + RGWCopyObj_ObjStore_S3() : sent_header(false) {} ~RGWCopyObj_ObjStore_S3() {} int init_dest_policy(); int get_params(); + void send_partial_response(off_t ofs); void send_response(); }; diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index 157158e7ed7..b4f830830f9 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -288,6 +288,8 @@ int RGWCreateBucket_ObjStore_SWIFT::get_params() { policy.create_default(s->user.user_id, s->user.display_name); + location_constraint = store->region.api_name; + return 0; } @@ -475,13 +477,40 @@ int RGWCopyObj_ObjStore_SWIFT::get_params() return 0; } +void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs) +{ + if (!sent_header) { + if (!ret) + ret = STATUS_CREATED; + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); + + /* Send progress information. Note that this diverge from the original swift + * spec. We do this in order to keep connection alive. 
+ */ + if (ret == 0) { + s->formatter->open_array_section("progress"); + } + sent_header = true; + } else { + s->formatter->dump_int("ofs", (uint64_t)ofs); + } + rgw_flush_formatter(s, s->formatter); +} + void RGWCopyObj_ObjStore_SWIFT::send_response() { - if (!ret) - ret = STATUS_CREATED; - set_req_state_err(s, ret); - dump_errno(s); - end_header(s); + if (!sent_header) { + if (!ret) + ret = STATUS_CREATED; + set_req_state_err(s, ret); + dump_errno(s); + end_header(s); + } else { + s->formatter->close_section(); + rgw_flush_formatter(s, s->formatter); + } } int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) @@ -829,11 +858,16 @@ int RGWHandler_ObjStore_SWIFT::init_from_header(struct req_state *s) s->bucket_name_str = first; s->bucket_name = strdup(s->bucket_name_str.c_str()); + + s->info.effective_uri = "/" + s->bucket_name_str; + if (req.size()) { s->object_str = req; s->object = strdup(s->object_str.c_str()); + s->info.effective_uri.append("/" + s->object_str); } + return 0; } diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h index e4b6f0bccee..1c23ab29204 100644 --- a/src/rgw/rgw_rest_swift.h +++ b/src/rgw/rgw_rest_swift.h @@ -100,13 +100,15 @@ public: }; class RGWCopyObj_ObjStore_SWIFT : public RGWCopyObj_ObjStore { + bool sent_header; public: - RGWCopyObj_ObjStore_SWIFT() {} + RGWCopyObj_ObjStore_SWIFT() : sent_header(false) {} ~RGWCopyObj_ObjStore_SWIFT() {} int init_dest_policy(); int get_params(); void send_response(); + void send_partial_response(off_t ofs); }; class RGWGetACLs_ObjStore_SWIFT : public RGWGetACLs_ObjStore { diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h index baed9de28e0..93d0809d491 100644 --- a/src/test/ObjectMap/KeyValueDBMemory.h +++ b/src/test/ObjectMap/KeyValueDBMemory.h @@ -104,8 +104,7 @@ public: for (list<Context *>::iterator i = on_commit.begin(); i != on_commit.end(); on_commit.erase(i++)) { - (*i)->finish(0); - delete *i; + (*i)->complete(0); } return 0; } diff --git a/src/test/filestore/workload_generator.h b/src/test/filestore/workload_generator.h index 6a63b353c61..80e95dae6ec 100644 --- a/src/test/filestore/workload_generator.h +++ b/src/test/filestore/workload_generator.h @@ -163,7 +163,7 @@ public: : stat_state(state), ctx(context) { } void finish(int r) { - ctx->finish(r); + ctx->complete(r); stat_state->wrkldgen->m_stats_lock.Lock(); diff --git a/src/test/gather.cc b/src/test/gather.cc index 92bec7650c6..e067ceed8f9 100644 --- a/src/test/gather.cc +++ b/src/test/gather.cc @@ -40,8 +40,7 @@ TEST(ContextGather, OneSub) { C_Checker *checker = new C_Checker(&finish_called, &result); gather.set_finisher(checker); gather.activate(); - sub->finish(0); - delete sub; + sub->complete(0); EXPECT_TRUE(finish_called); EXPECT_EQ(0, result); } @@ -63,14 +62,12 @@ TEST(ContextGather, ManySubs) { //finish all except one sub for (int j = 0; j < sub_count - 1; ++j) { - subs[j]->finish(0); - delete subs[j]; + subs[j]->complete(0); EXPECT_FALSE(finish_called); } //finish last one and check asserts - subs[sub_count-1]->finish(0); - delete subs[sub_count-1]; + subs[sub_count-1]->complete(0); EXPECT_TRUE(finish_called); } @@ -92,16 +89,14 @@ TEST(ContextGather, AlternatingSubCreateFinish) { //alternate finishing first half of subs and creating last half of subs for (int j = 0; j < sub_count / 2; ++j) { - subs[j]->finish(0); - delete subs[j]; + subs[j]->complete(0); subs[sub_count / 2 + j] = gather.new_sub(); } gather.activate(); //finish last half of subs for (int k = 
sub_count / 2; k < sub_count; ++k) { - subs[k]->finish(0); - delete subs[k]; + subs[k]->complete(0); } EXPECT_TRUE(finish_called); diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc index d8ec8d03df2..e0863f726a0 100644 --- a/src/test/osd/TestPGLog.cc +++ b/src/test/osd/TestPGLog.cc @@ -82,6 +82,10 @@ TEST_F(PGLogTest, rewind_divergent_log) { hobject_t divergent_object; eversion_t divergent_version; eversion_t newhead; + + hobject_t divergent; + divergent.hash = 0x9; + { pg_log_entry_t e; @@ -90,16 +94,16 @@ TEST_F(PGLogTest, rewind_divergent_log) { log.tail = e.version; log.log.push_back(e); e.version = newhead = eversion_t(1, 4); - e.soid.hash = 0x9; + e.soid = divergent; e.op = pg_log_entry_t::MODIFY; log.log.push_back(e); - log.index(); e.version = divergent_version = eversion_t(1, 5); - e.soid.hash = 0x9; + e.soid = divergent; divergent_object = e.soid; e.op = pg_log_entry_t::DELETE; log.log.push_back(e); log.head = e.version; + log.index(); info.last_update = log.head; info.last_complete = log.head; @@ -118,6 +122,7 @@ TEST_F(PGLogTest, rewind_divergent_log) { rewind_divergent_log(t, newhead, info, remove_snap, dirty_info, dirty_big_info); + EXPECT_TRUE(log.objects.count(divergent)); EXPECT_TRUE(missing.is_missing(divergent_object)); EXPECT_EQ(1U, log.objects.count(divergent_object)); EXPECT_EQ(2U, log.log.size()); diff --git a/src/tools/ceph-monstore-tool.cc b/src/tools/ceph-monstore-tool.cc index ae608a302f2..f361266aff0 100644 --- a/src/tools/ceph-monstore-tool.cc +++ b/src/tools/ceph-monstore-tool.cc @@ -31,6 +31,7 @@ #include "global/global_init.h" #include "os/LevelDBStore.h" #include "mon/MonitorDBStore.h" +#include "mon/Paxos.h" #include "common/Formatter.h" namespace po = boost::program_options; @@ -246,6 +247,19 @@ int main(int argc, char **argv) { goto done; } bl.write_fd(fd); + } else if (cmd == "dump-paxos") { + for (version_t v = dstart; v <= dstop; ++v) { + bufferlist bl; + st.get("paxos", v, bl); + if (bl.length() == 0) + break; + cout << "\n--- " << v << " ---" << std::endl; + MonitorDBStore::Transaction tx; + Paxos::decode_append_transaction(tx, bl); + JSONFormatter f(true); + tx.dump(&f); + f.flush(cout); + } } else if (cmd == "dump-trace") { if (tfile.empty()) { std::cerr << "Need trace_file" << std::endl; |