summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore8
-rw-r--r--.mailmap84
-rw-r--r--COPYING74
-rw-r--r--PendingReleaseNotes23
-rw-r--r--ceph.spec.in1
-rw-r--r--configure.ac2
-rw-r--r--debian/changelog6
-rw-r--r--debian/control1
-rw-r--r--debian/copyright60
-rw-r--r--doc/architecture.rst2
-rw-r--r--doc/changelog/v0.67.4.txt550
-rw-r--r--doc/dev/osd_internals/erasure_coding.rst26
-rw-r--r--doc/dev/osd_internals/erasure_coding/PGBackend-h.rst156
-rw-r--r--doc/dev/osd_internals/erasure_coding/developer_notes.rst257
-rw-r--r--doc/dev/osd_internals/erasure_coding/jerasure.rst22
-rw-r--r--doc/dev/osd_internals/erasure_coding/pgbackend.rst42
-rw-r--r--doc/index.rst1
-rw-r--r--doc/install/index.rst69
-rw-r--r--doc/install/libvirt-deb.rst43
-rw-r--r--doc/install/libvirt-rpm.rst19
-rw-r--r--doc/install/qemu-deb.rst26
-rw-r--r--doc/install/qemu-rpm.rst56
-rw-r--r--doc/install/rpm.rst165
-rw-r--r--doc/install/yum-priorities.rst20
-rw-r--r--doc/man/8/rbd.rst4
-rw-r--r--doc/rados/configuration/journal-ref.rst6
-rw-r--r--doc/rados/operations/add-or-rm-mons.rst12
-rw-r--r--doc/rados/operations/authentication.rst15
-rw-r--r--doc/rados/operations/operating.rst237
-rw-r--r--doc/rbd/libvirt.rst54
-rw-r--r--doc/rbd/qemu-rbd.rst26
-rw-r--r--doc/rbd/rbd-openstack.rst2
-rw-r--r--doc/release-notes.rst87
-rw-r--r--doc/start/hardware-recommendations.rst (renamed from doc/install/hardware-recommendations.rst)0
-rw-r--r--doc/start/index.rst39
-rw-r--r--doc/start/intro.rst70
-rw-r--r--doc/start/os-recommendations.rst (renamed from doc/install/os-recommendations.rst)33
-rw-r--r--doc/start/quick-ceph-deploy.rst390
-rw-r--r--doc/start/quick-cephfs.rst4
-rw-r--r--doc/start/quick-rbd.rst56
-rw-r--r--doc/start/quick-rgw.rst4
-rw-r--r--doc/start/quick-start-preflight.rst195
-rw-r--r--fusetrace/fusetrace_ll.cc2
-rw-r--r--man/rbd.85
-rw-r--r--qa/run_xfstests.sh3
-rwxr-xr-xqa/workunits/cephtool/test.sh11
-rwxr-xr-xqa/workunits/mon/crush_ops.sh9
-rwxr-xr-xqa/workunits/rados/test_tmap_to_omap.sh28
-rwxr-xr-xqa/workunits/rbd/copy.sh12
-rwxr-xr-xqa/workunits/rbd/import_export.sh8
-rwxr-xr-xqa/workunits/snaps/snap-rm-diff.sh1
-rwxr-xr-xqa/workunits/snaps/snaptest-0.sh12
-rwxr-xr-xqa/workunits/snaps/snaptest-1.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-2.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-authwb.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-capwb.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-dir-rename.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-double-null.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-estale.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-git-ceph.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-intodir.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-multiple-capsnaps.sh2
-rw-r--r--qa/workunits/snaps/snaptest-parents.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-snap-rm-cmp.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-upchildrealms.sh2
-rwxr-xr-xqa/workunits/snaps/snaptest-xattrwb.sh2
-rwxr-xr-xqa/workunits/snaps/untar_snap_rm.sh2
-rwxr-xr-xqa/workunits/suites/fsstress.sh2
-rw-r--r--src/.gitignore1
-rw-r--r--src/Makefile-env.am7
-rw-r--r--src/Makefile.am9
-rw-r--r--src/arch/intel.c7
-rwxr-xr-xsrc/ceph-create-keys2
-rwxr-xr-xsrc/ceph-disk30
-rwxr-xr-xsrc/ceph-rest-api2
-rwxr-xr-xsrc/ceph.in6
-rw-r--r--src/ceph_osd.cc2
-rw-r--r--src/client/Client.cc109
-rw-r--r--src/client/Client.h17
-rw-r--r--src/client/fuse_ll.cc20
-rw-r--r--src/common/Cond.h4
-rw-r--r--src/common/Makefile.am11
-rw-r--r--src/common/Mutex.h4
-rw-r--r--src/common/SloppyCRCMap.cc180
-rw-r--r--src/common/SloppyCRCMap.h78
-rw-r--r--src/common/TrackedOp.cc265
-rw-r--r--src/common/TrackedOp.h154
-rw-r--r--src/common/WorkQueue.h37
-rw-r--r--src/common/bloom_filter.cc137
-rw-r--r--src/common/bloom_filter.hpp700
-rw-r--r--src/common/buffer.cc9
-rw-r--r--src/common/ceph_argparse.cc17
-rw-r--r--src/common/ceph_json.cc4
-rw-r--r--src/common/ceph_strings.cc2
-rw-r--r--src/common/config_opts.h26
-rw-r--r--src/common/crc32c_intel_fast.c1
-rw-r--r--src/common/crc32c_intel_fast.h2
-rw-r--r--src/common/hobject.cc87
-rw-r--r--src/common/hobject.h125
-rw-r--r--src/common/safe_io.c80
-rw-r--r--src/common/safe_io.h9
-rw-r--r--src/common/util.cc1
-rw-r--r--src/crush/CrushWrapper.cc1
-rw-r--r--src/crush/CrushWrapper.h1
-rw-r--r--src/global/signal_handler.cc4
-rw-r--r--src/include/CompatSet.h46
-rw-r--r--src/include/Context.h20
-rw-r--r--src/include/Makefile.am2
-rw-r--r--src/include/bloom_filter.hpp544
-rw-r--r--src/include/buffer.h17
-rw-r--r--src/include/ceph_fs.h1
-rw-r--r--src/include/crc32c.h3
-rw-r--r--src/include/histogram.h76
-rw-r--r--src/include/rados.h2
-rw-r--r--src/include/rados/librados.h6
-rw-r--r--src/include/rados/librados.hpp20
-rw-r--r--src/include/types.h8
-rw-r--r--src/init-ceph.in24
-rw-r--r--src/librados/RadosClient.cc2
-rw-r--r--src/librados/librados.cc16
-rw-r--r--src/mds/CDentry.cc12
-rw-r--r--src/mds/CDentry.h4
-rw-r--r--src/mds/CDir.cc17
-rw-r--r--src/mds/CDir.h1
-rw-r--r--src/mds/CInode.cc7
-rw-r--r--src/mds/CInode.h3
-rw-r--r--src/mds/Locker.cc5
-rw-r--r--src/mds/LogEvent.cc14
-rw-r--r--src/mds/MDCache.cc291
-rw-r--r--src/mds/MDCache.h34
-rw-r--r--src/mds/MDLog.cc8
-rw-r--r--src/mds/MDS.cc1
-rw-r--r--src/mds/MDSMap.cc11
-rw-r--r--src/mds/MDSMap.h15
-rw-r--r--src/mds/Server.cc18
-rw-r--r--src/mds/flock.h2
-rw-r--r--src/mds/mdstypes.cc5
-rw-r--r--src/mds/mdstypes.h14
-rw-r--r--src/mon/MDSMonitor.cc30
-rw-r--r--src/mon/MonCommands.h27
-rw-r--r--src/mon/Monitor.cc252
-rw-r--r--src/mon/Monitor.h14
-rw-r--r--src/mon/MonmapMonitor.cc39
-rw-r--r--src/mon/OSDMonitor.cc39
-rw-r--r--src/mon/PGMap.cc58
-rw-r--r--src/mon/PGMap.h37
-rw-r--r--src/mon/PGMonitor.cc72
-rw-r--r--src/mon/PGMonitor.h1
-rw-r--r--src/msg/Pipe.cc13
-rw-r--r--src/msg/Pipe.h11
-rw-r--r--src/msg/msg_types.cc2
-rw-r--r--src/objclass/class_api.cc2
-rwxr-xr-xsrc/objsync/boto_del.py2
-rw-r--r--src/os/CollectionIndex.h20
-rw-r--r--src/os/DBObjectMap.cc197
-rw-r--r--src/os/DBObjectMap.h98
-rw-r--r--src/os/FDCache.h8
-rw-r--r--src/os/FileStore.cc582
-rw-r--r--src/os/FileStore.h194
-rw-r--r--src/os/FlatIndex.cc42
-rw-r--r--src/os/FlatIndex.h14
-rw-r--r--src/os/GenericFileStoreBackend.cc113
-rw-r--r--src/os/GenericFileStoreBackend.h15
-rw-r--r--src/os/HashIndex.cc88
-rw-r--r--src/os/HashIndex.h32
-rw-r--r--src/os/IndexManager.cc2
-rw-r--r--src/os/LFNIndex.cc271
-rw-r--r--src/os/LFNIndex.h90
-rw-r--r--src/os/Makefile.am3
-rw-r--r--src/os/ObjectMap.h44
-rw-r--r--src/os/ObjectStore.cc96
-rw-r--r--src/os/ObjectStore.h162
-rw-r--r--src/os/WBThrottle.cc22
-rw-r--r--src/os/WBThrottle.h40
-rw-r--r--src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc192
-rw-r--r--src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h10
-rw-r--r--src/osd/Makefile.am4
-rw-r--r--src/osd/OSD.cc387
-rw-r--r--src/osd/OSD.h38
-rw-r--r--src/osd/OpRequest.cc263
-rw-r--r--src/osd/OpRequest.h106
-rw-r--r--src/osd/PG.cc158
-rw-r--r--src/osd/PG.h20
-rw-r--r--src/osd/PGBackend.h230
-rw-r--r--src/osd/PGLog.cc33
-rw-r--r--src/osd/ReplicatedBackend.cc268
-rw-r--r--src/osd/ReplicatedBackend.h329
-rw-r--r--src/osd/ReplicatedPG.cc1847
-rw-r--r--src/osd/ReplicatedPG.h515
-rw-r--r--src/osd/osd_types.cc44
-rw-r--r--src/osd/osd_types.h127
-rw-r--r--src/osdc/ObjectCacher.cc6
-rw-r--r--src/osdc/Objecter.cc2
-rw-r--r--src/osdc/Objecter.h44
-rw-r--r--src/pybind/ceph_argparse.py26
-rwxr-xr-xsrc/pybind/ceph_rest_api.py1
-rw-r--r--src/rbd.cc13
-rw-r--r--src/rbd_fuse/rbd-fuse.c2
-rw-r--r--src/rgw/rgw_metadata.cc2
-rw-r--r--src/rgw/rgw_rados.cc7
-rwxr-xr-xsrc/script/perf-watch.py2
-rw-r--r--src/test/Makefile.am32
-rw-r--r--src/test/ObjectMap/test_object_map.cc54
-rw-r--r--src/test/ObjectMap/test_store_tool/test_store_tool.cc90
-rw-r--r--src/test/ceph_compatset.cc164
-rw-r--r--src/test/cli-integration/rbd/formatted-output.t22
-rw-r--r--src/test/cli/radosgw-admin/help.t9
-rw-r--r--src/test/cli/rbd/help.t1
-rw-r--r--src/test/common/get_command_descriptions.cc116
-rw-r--r--src/test/common/test_bloom_filter.cc289
-rw-r--r--src/test/common/test_sloppy_crc_map.cc113
-rw-r--r--src/test/common/test_util.cc1
-rw-r--r--src/test/encoding/types.h12
-rw-r--r--src/test/filestore/FileStoreDiff.cc12
-rwxr-xr-xsrc/test/filestore/run_seed_to_range.sh2
-rw-r--r--src/test/filestore/store_test.cc124
-rw-r--r--src/test/filestore/workload_generator.cc4
-rw-r--r--src/test/librados/misc.cc78
-rw-r--r--src/test/os/TestFlatIndex.cc12
-rw-r--r--src/test/os/TestLFNIndex.cc79
-rw-r--r--src/test/osd/ErasureCodeExample.h11
-rw-r--r--src/test/osd/ErasureCodePluginExample.cc4
-rw-r--r--src/test/osd/ErasureCodePluginHangs.cc24
-rw-r--r--src/test/osd/Object.cc9
-rw-r--r--src/test/osd/RadosModel.h73
-rw-r--r--src/test/osd/TestErasureCodeExample.cc23
-rw-r--r--src/test/osd/TestErasureCodeJerasure.cc113
-rw-r--r--src/test/osd/TestErasureCodePlugin.cc14
-rw-r--r--src/test/osd/TestErasureCodePluginJerasure.cc15
-rw-r--r--src/test/osd/TestRados.cc9
-rwxr-xr-xsrc/test/pybind/test_ceph_argparse.py1061
-rw-r--r--src/tools/ceph-filestore-dump.cc135
-rw-r--r--src/tools/ceph-osdomap-tool.cc10
-rw-r--r--src/tools/dupstore.cc6
-rw-r--r--src/tools/rados/rados.cc61
-rwxr-xr-xsrc/vstart.sh5
236 files changed, 11521 insertions, 5144 deletions
diff --git a/.gitignore b/.gitignore
index 211c09cbba7..7e637866366 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,4 +69,10 @@ web/*.html
# dir from coverity tools
cov-int/
-/test-driver \ No newline at end of file
+/test-driver
+
+# gtags(1) generated files
+GPATH
+GRTAGS
+GSYMS
+GTAGS
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 00000000000..fc4a1eb9ce9
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,84 @@
+Sage Weil <sage@inktank.com> <sage@newdream.net>
+Sage Weil <sage@inktank.com> <sage.weil@dreamhost.com>
+Sage Weil <sage@inktank.com> <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
+Sage Weil <sage@inktank.com> <sage@29311d96-e01e-0410-9327-a35deaab8ce9>
+Sage Weil <sage@inktank.com> <sage@ceph0.dreamhost.com>
+Sage Weil <sage@inktank.com> <sage@skinny.ops.newdream.net>
+Sage Weil <sage@inktank.com> <sage@foil.westwood.newdream.net>
+Sage Weil <sage@inktank.com> <sage@vapre.localdomain>
+Sage Weil <sage@inktank.com> <sage.weil@inktank.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@hq.newdream.net>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda.sadeh@dreamhost.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@yehuda.infit.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehuda@yehuda>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@fatty.ops.newdream.net>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@gmail.com>
+Yehuda Sadeh <yehuda@inktank.com> <yehudasa@ceph0.dreamhost.com>
+Colin P. McCabe <colinm@hq.newdream.net> <cmccabe@alumni.cmu.edu>
+Colin P. McCabe <colinm@hq.newdream.net> <cmccabe@fatty.ops.newdream.net>
+Greg Farnum <greg@inktank.com> <gregf@hq.newdream.net>
+Greg Farnum <greg@inktank.com> <gregory.farnum@dreamhost.com>
+Greg Farnum <greg@inktank.com> Gregory Farnum <greg@inktank.com>
+Greg Farnum <greg@inktank.com> <greg@gregs42.com>
+Greg Farnum <greg@inktank.com> <gregf@skinny.ops.newdream.net>
+Greg Farnum <greg@inktank.com> <gfarnum@GF-Macbook.local>
+Samuel Just <sam.just@inktank.com> <samuel.just@dreamhost.com>
+Samuel Just <sam.just@inktank.com> <rexludorum@gmail.com>
+Samuel Just <sam.just@inktank.com> <samuelj@hq.newdream.net>
+Samuel Just <sam.just@inktank.com> <sam.just@dreamhost.com>
+Samuel Just <sam.just@inktank.com> <sam@Pondermatic.(none)>
+John Wilkins <john.wilkins@inktank.com> <john.wilkins@dreamhost.com>
+John Wilkins <john.wilkins@inktank.com> <john@admin-host.(none)>
+John Wilkins <john.wilkins@inktank.com> <johnw@johnw7664.(none)>
+Josh Durgin <josh.durgin@inktank.com> <josh.durgin@dreamhost.com>
+Josh Durgin <josh.durgin@inktank.com> <joshd@hq.newdream.net>
+Dan Mick <dan.mick@inktank.com> <dan.mick@dreamhost.com>
+Dan Mick <dan.mick@inktank.com> <dmick@danceorelse.org>
+Tommi Virtanen <tv@inktank.com> <tommi.virtanen@dreamhost.com>
+Tommi Virtanen <tv@inktank.com> <tv@hq.newdream.net>
+Tommi Virtanen <tv@inktank.com> <tv@eagain.net>
+João Eduardo Luís <joao.luis@inktank.com> <jecluis@gmail.com>
+João Eduardo Luís <joao.luis@inktank.com> Joao Eduardo Luis <joao.luis@inktank.com>
+Sam Lang <sam.lang@inktank.com> <samlang@gmail.com>
+Noah Watkins <noahwatkins@gmail.com> <jayhawk@cs.ucsc.edu>
+Gary Lowell <gary.lowell@inktank.com> <glowell@flab.ops.newdream.net>
+Gary Lowell <gary.lowell@inktank.com> <glowell@inktank.com>
+Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9>
+Wido den Hollander <wido@42on.com> <wido@widodh.nl>
+Michael Rodriguez <michael@newdream.net> <michael@squid.newdream.net>
+Michael Rodriguez <michael@newdream.net> <michael@newdream.net>
+Caleb Miles <caleb.miles@inktank.com> caleb miles <caselim@gmail.com>
+Caleb Miles <caleb.miles@inktank.com> caleb miles <caleb.miles@inktank.com>
+Caleb Miles <caleb.miles@inktank.com> Caleb Miles <caselim@gmail.com>
+Joe Buck <jbbuck@gmail.com> <buck@soe.ucsc.edu>
+Laszlo Boszormenyi <gcs@debian.hu> Laszlo Boszormenyi (GCS) <gcs@debian.hu>
+Roald J. van Loon <roaldvanloon@gmail.com> Roald van Loon <roaldvanloon@gmail.com>
+Alex Elder <elder@inktank.com> <elder@dreamhost.com>
+Alex Elder <elder@inktank.com> <elder@doink.(none)>
+Alex Elder <elder@inktank.com> <elder@speedy.(none)>
+Alexandre Marangone <alexandre.marangone@inktank.com> <a.marangone@gmail.com>
+Alexandre Oliva <oliva@gnu.org> <oliva@lsd.ic.unicamp.br>
+Alexandre Oliva <oliva@gnu.org> <lxoliva@fsfla.org>
+Ross Turk <ross.turk@inktank.com> <ross@inktank.com>
+Ross Turk <ross.turk@inktank.com> <ross.turk@dreamhost.com>
+Patrick McGarry <patrick@inktank.com> <pmcgarry@gmail.com>
+Patrick McGarry <patrick@inktank.com> scuttlemonkey <patrick@inktank.com>
+Mark Nelson <mark.nelson@inktank.com> <mark.a.nelson@gmail.com>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> <tamil@ubuntu.(none)>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> tamil <tamil.muthamizhan@inktank.com>
+Tamil Muthamizhan <tamil.muthamizhan@inktank.com> <tamil@tamil-VirtualBox.(none)>
+Christian Brunner <christian@brunner-muc.de> <chb@muc.de>
+Henry C Chang <henry_c_chang@tcloudcomputing.com> <henry.cy.chang@gmail.com>
+Alfredo Deza <alfredo.deza@inktank.com> <alfredo@deza.pe>
+Sylvain Munaut <s.munaut@whatever-company.com> <tnt@246tNt.com>
+Erwin, Brock A <Brock.Erwin@pnl.gov> <Brock.Erwin@pnl.govgit>
+Kacper Kowalik <xarthisius@gentoo.org> Kacper Kowalik (Xarthisius) <xarthisius@gentoo.org>
+Neil Levine <neil.levine@inktank.com> <levine@yoyo.org>
+Guilhem Lettron <guilhem@lettron.fr> <guilhem+github@lettron.fr>
+Holger Macht <hmacht@suse.de> <holger@homac.de>
+Volker Assmann <volker@twisted-nerve.de> <volker@stan.local>
+Volker Assmann <volker@twisted-nerve.de> <volker@36-135.mops.RWTH-Aachen.DE>
+Sebastien Han <sebastien.han@enovance.com> <sebastien.han@enovance.com>
+Matthew Roy <matthew@royhousehold.net> <matthew@matthew-ubuntu.(none)>
+Matthew Roy <matthew@royhousehold.net> <mroy@sandbox-ed.com>
+Matthew Wodrich <matthew.wodrich@dreamhost.com> <mattheww@Mattsbox.(none)>
diff --git a/COPYING b/COPYING
index 920b049b7fa..a0034d58c3b 100644
--- a/COPYING
+++ b/COPYING
@@ -1,3 +1,8 @@
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
+Name: ceph
+Maintainer: Sage Weil <sage@newdream.net>
+Source: http://ceph.com/
+
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
License: LGPL2.1 (see COPYING-LGPL2.1)
@@ -18,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow <arash@partow.net>
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -94,33 +103,38 @@ Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
License: LGPL2 or later
Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
-Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
-
- - Neither the name of the University of Tennessee nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
-OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
-AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
-WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+Packaging:
+ Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
+ Copyright (C) 2010 Canonical, Ltd.
+ Licensed under LGPL-2.1
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index e7fcd7201bb..a3ec73290f3 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,2 +1,23 @@
-v0.69
+v0.71
~~~~~
+
+* The MDS now disallows snapshots by default as they are not
+ considered stable. The command 'ceph mds set allow_snaps' will
+ enable them.
+
+* For clusters that were created before v0.44 (pre-argonaut, Spring
+ 2012) and store radosgw data, the auto-upgrade from TMAP to OMAP
+ objects has been disabled. Before upgrading, make sure that any
+ buckets created on pre-argonaut releases have been modified (e.g.,
+ by PUTing and then DELETEing an object from each bucket). Any
+ cluster created with argonaut (v0.48) or a later release or not
+ using radosgw never relied on the automatic conversion and is not
+ affected by this change.
+
+* Any direct users of the 'tmap' portion of the librados API should be
+ aware that the automatic tmap -> omap conversion functionality has
+ been removed.
+
+* Most output that used K or KB (e.g., for kilobyte) now uses a
+ lower-case k to match the official SI convention. Any scripts that
+ parse output and check for an upper-case K will need to be modified.
diff --git a/ceph.spec.in b/ceph.spec.in
index 851ee7acfd5..a60d87ad814 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -37,6 +37,7 @@ BuildRequires: perl
BuildRequires: gdbm
BuildRequires: pkgconfig
BuildRequires: python
+BuildRequires: python-nose
BuildRequires: libaio-devel
BuildRequires: libcurl-devel
BuildRequires: libxml2-devel
diff --git a/configure.ac b/configure.ac
index eeecdbeffc8..1eee4609ec1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.69], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.70], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
diff --git a/debian/changelog b/debian/changelog
index ce73472f9eb..4628bb52175 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (0.70-1) stable; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Fri, 04 Oct 2013 20:11:51 +0000
+
ceph (0.69-1) precise; urgency=low
* New upstream release
diff --git a/debian/control b/debian/control
index 44ee725efd4..1aec592c9f8 100644
--- a/debian/control
+++ b/debian/control
@@ -34,6 +34,7 @@ Build-Depends: autoconf,
libxml2-dev,
pkg-config,
python (>= 2.6.6-3~),
+ python-nose,
uuid-dev,
yasm
Standards-Version: 3.9.3
diff --git a/debian/copyright b/debian/copyright
index d11a0f7f5da..d3906c44d35 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,11 +1,15 @@
-Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
Name: ceph
Maintainer: Sage Weil <sage@newdream.net>
Source: http://ceph.com/
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
-License: LGPL2.1 (see /usr/share/common-licenses/LGPL-2.1)
+License: LGPL2.1 (see COPYING-LGPL2.1)
+
+Files: doc/*
+Copyright: (c) 2010-2012 New Dream Network and contributors
+License: Creative Commons Attribution-ShareAlike (CC BY-SA)
Files: src/mount/canonicalize.c
Copyright: Copyright (C) 1993 Rick Sladkey <jrs@world.std.com>
@@ -19,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -28,25 +36,25 @@ Copyright:
Copyright 2012-2013 Intel Corporation All Rights Reserved.
License: BSD 3-clause
-Files: src/common/sctp_crc32.c:
+Files: src/common/sctp_crc32.c:
Copyright:
Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
License:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
-
+
a) Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
+
b) Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the distribution.
-
+
c) Neither the name of Cisco Systems, Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
-
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -88,6 +96,44 @@ License:
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
+
+
+Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc
+Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+License: LGPL2 or later
+
+Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
Packaging:
Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
Copyright (C) 2010 Canonical, Ltd.
diff --git a/doc/architecture.rst b/doc/architecture.rst
index 9f57bbbd58a..988475f53b6 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -387,7 +387,7 @@ steps to compute PG IDs.
#. CRUSH calculates the hash modulo the number of OSDs. (e.g., ``0x58``) to get
a PG ID.
#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``)
-#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``).
+#. CRUSH prepends the pool ID to the PG ID (e.g., ``4.0x58``).
Computing object locations is much faster than performing object location query
over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable
diff --git a/doc/changelog/v0.67.4.txt b/doc/changelog/v0.67.4.txt
new file mode 100644
index 00000000000..73b997ea304
--- /dev/null
+++ b/doc/changelog/v0.67.4.txt
@@ -0,0 +1,550 @@
+commit ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Thu Oct 3 22:41:31 2013 +0000
+
+ v0.67.4
+
+commit 5cd66d3b4bca92b402c95ab256fbc3f0329c446f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 20 14:04:47 2013 -0700
+
+ rgw: fix keystone token expiration test
+
+ Fixes: #6360
+ The test was inverted, need expiration to be greater than
+ current time in order for token to be valid.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit e0203c61a3f45fdd6d3d3ece26fef6152bdc036d
+Author: David Zafman <david.zafman@inktank.com>
+Date: Wed Sep 11 16:55:06 2013 -0700
+
+ osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active
+
+ Caused by 944f3b73531af791c90f0f061280160003545c63
+
+ Fixes: #6291
+
+ Backport: dumpling
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 139a714e13aa3c7f42091270b55dde8a17b3c4b8)
+
+ Conflicts:
+
+ src/osd/OSD.cc
+
+commit c376708358cedb5561fbb43e9b9e622df3ea7a58
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Wed Sep 25 22:08:24 2013 +0100
+
+ mon: OSDMonitor: do not write full_latest during trim
+
+ On commit 81983bab we patched OSDMonitor::update_from_paxos() such that we
+ write the latest full map version to 'full_latest' each time the latest
+ full map was built from the incremental versions.
+
+ This change however clashed with OSDMonitor::encode_trim_extra(), which
+ also wrote to 'full_latest' on each trim, writing instead the version of
+ the *oldest* full map. This duality of behaviors could lead the store
+ to an inconsistent state across the monitors (although there's no sign of
+ it actually imposing any issues besides rebuilding already existing full
+ maps on some monitors).
+
+ We now stop OSDMonitor::encode_trim_extra() from writing to 'full_latest'.
+ This function will still write out the oldest full map it has in the store,
+ but it will no longer write to full_latest, instead leaving it up to
+ OSDMonitor::update_from_paxos() to figure it out -- and it already does.
+
+ Fixes: #6378
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit bd0f29a2c28cca496ec830eac932477ebf3182ba)
+
+commit de40d0b3e35ab0124cd3c4ebfcaa435ab8abfab9
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Oct 1 15:53:42 2013 -0700
+
+ crush: invalidate rmap on create (and thus decode)
+
+ If we have an existing CrushWrapper object and decode from a bufferlist,
+ reset build_rmaps so that they get rebuilt.
+
+ Remove the build_rmaps() all in decode that was useless on a redecode
+ (because have_rmaps == true in that case and it did nothing).
+
+ Fixes: #6442
+ Backport: dumpling, maybe cuttlefish
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21)
+
+commit 32f5233288c47d95b87c0a9cab5f9c2ffcf15417
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Mon Sep 30 14:58:11 2013 -0700
+
+ Invoke python with /usr/bin/env python instead of directly
+
+ Fixes: #6311
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit b9000b314b9166845ff302d4a827a996775d9a14)
+
+commit 66aeca5a9079be398403bbff67bd5bf68c6fb111
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Sep 25 10:10:21 2013 -0700
+
+ qa/workunits/mon/crush_ops.sh: fix test
+
+ Fix root.
+
+ Fixes: #6392
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c8cae87e9e08468cc86145e0fd60c05d12826239)
+
+commit beb366302a125dd422c4f092b12eb541cb3bc788
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 09:04:34 2013 -0700
+
+ Revert "ceph: parse CEPH_ARGS environment variable"
+
+ This reverts commit 67a95b9880c9bc6e858150352318d68d64ed74ad.
+
+ We now put CEPH_ARGS in the actual args we parse in python, which are passed
+ to rados piecemeal later. This lets you put things like --id ... in there
+ that need to be parsed before librados is initialized.
+ (cherry picked from commit 97f462be4829f0167ed3d65e6694dfc16f1f3243)
+
+commit b475ff9576f145d31c053213c699e13df76d2bcb
+Author: Benoît Knecht <benoit.knecht@fsfe.org>
+Date: Mon Sep 23 15:58:42 2013 +0200
+
+ Add CEPH_ARGS at the end of sys.argv
+
+ This allows, for instance, to pass a different client name to ceph by
+ exporting CEPH_ARGS="--id client_id".
+
+ Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 30abe3244c86cbbe1f5b005850c29c9c0eafcad4)
+
+commit 94548b4b67cca37366c7d8719209a6d2e7956811
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Sep 24 15:26:03 2013 -0700
+
+ mon/OSDMonitor: fix 'ceph osd crush reweight ...'
+
+ The adjust method returns a count of adjusted items.
+
+ Add a test.
+
+ Fixes: #6382
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 3de32562b55c6ece3a6ed783c36f8b9f21460339)
+
+commit 00ff7f5c20e13869d0694379739ba4e61d44b97c
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 10 00:20:41 2013 +0100
+
+ qa: workunits: mon: crush_ops: test 'ceph osd crush move'
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 3bc618b7b46496c5110edde0da9cae5d3e68e0e1)
+
+commit 0ff5b4a96833681e92cc41f019a569134474f4cf
+Author: Loic Dachary <loic@dachary.org>
+Date: Tue Sep 24 19:04:23 2013 +0200
+
+ osd: change warn_interval_multiplier to uint32_t
+
+ to prevent overflow in OpTracker::check_ops_in_flight when
+ multiplying warn_interval_multiplier *= 2
+
+ Backport: cuttlefish, dumpling
+
+ http://tracker.ceph.com/issues/6370 fixes #6370
+
+ Signed-off-by: Loic Dachary <loic@dachary.org>
+ (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db)
+
+commit fb15040b6cec6221baa550ddfffade823f784c4a
+Author: David Zafman <david.zafman@inktank.com>
+Date: Mon Sep 9 13:01:12 2013 -0700
+
+ crushtool: do not dump core with non-unique bucket IDs
+
+ Return -EEXIST on duplicate ID
+ BUG FIX: crush_add_bucket() mixes error returns and IDs
+ Add optional argument to return generated ID
+
+ Fixes: #6246
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 8c76f3a0f9cf100ea2c941dc2b61c470aa5033d7)
+
+commit 410db3f30c6eb54b807908c1f251ad4026e7d446
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:06:30 2013 +0100
+
+ qa: workunits: cephtool: check if 'heap' commands are parseable
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit b1eeaddd5f214c1b0883b44fc8cae07c649be7c4)
+
+commit 062060a38bb26ff260cc51accc534413d726de49
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:50:27 2013 +0100
+
+ osd: OSD: add 'heap' command to known osd commands array
+
+ Must have been forgotten during the cli rework.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 296f2d0db31e9f5a59a3a62a1e95b6c440430fa3)
+
+commit 3f32f57b98e0224a1d30b2a81d7d260be0f53800
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:43:27 2013 +0100
+
+ mds: MDS: pass only heap profiler commands instead of the whole cmd vector
+
+ The heap profiler doesn't care, nor should it, what our command name is.
+ It only cares about the commands it handles.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 238fe272c6bdb62d4e57fd8555c0136de99c8129)
+
+commit 46dcc46617d8f35ab8433540b22343ddcbcc3716
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:41:14 2013 +0100
+
+ perfglue/heap_profiler.cc: expect args as first element on cmd vector
+
+ We used to pass 'heap' as the first element of the cmd vector when
+ handling commands. We haven't been doing so for a while now, so we
+ needed to fix this.
+
+ Not expecting 'heap' also makes sense, considering that what we need to
+ know when we reach this function is what command we should handle, and
+ we should not care what the caller calls us when handling his business.
+
+ Fixes: #6361
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit c98b910d49bd2b46ceafdc430044a31524c29f5b)
+
+commit 9dc5f15fbae22244ad1f62925e17c9d81e856e55
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Sep 16 14:35:25 2013 -0700
+
+ rgw: destroy get_obj handle in copy_obj()
+
+ Fixes: #6176
+ Backport: dumpling
+ We take different code paths in copy_obj, make sure we close the handle
+ when we exit the function. Move the call to finish_get_obj() out of
+ copy_obj_data() as we don't create the handle there, so that should
+ makes code less confusing and less prone to errors.
+ Also, note that RGWRados::get_obj() also calls finish_get_obj(). For
+ everything to work in concert we need to pass a pointer to the handle
+ and not the handle itself. Therefore we needed to also change the call
+ to copy_obj_data().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 9e98620e4325d15c88440a890b267131613e1aa1)
+
+commit 471233e98a9f64ad513a4a196b7661b80534cb00
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Mon Sep 9 23:14:11 2013 +0100
+
+ mon: MonCommands: expect a CephString as 1st arg for 'osd crush move'
+
+ Fixes: #6230
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 7d3799fde19138f957f26ec6be10a8a0000fc1f0)
+
+commit 2908225092bd2aa1b8afcb7848c1cdac5bd9e638
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 16:23:33 2013 -0700
+
+ osd: revert 'osd max xattr size' limit
+
+ Set it to 0 (unlimited) for now.
+
+ Backport: dumpling
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit abb88d70643c3a76435b7a9d5b04ff29f7502361)
+
+commit b3d3b3747c1eef695138dac828e5fcb435309c7b
+Author: Greg Farnum <greg@inktank.com>
+Date: Wed Sep 11 16:24:32 2013 -0700
+
+ mds: be more careful about decoding LogEvents
+
+ We need to wrap the full decode section or we can abort the process
+ if there's an issue (which we may want to just skip by).
+
+ Signed-off-by: Greg Farnum <greg@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 73289b34b0be5b6612e38944794d59b5e789f841)
+
+commit 06c58132199ed22413b509dfa751321ccdb24225
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 17 17:58:20 2013 +0100
+
+ mon: OSDMonitor: multiple rebuilt full maps per transaction
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 0d20cae0be701c5b6151a26ee5e4fe24d89aa20a)
+
+commit 65bbcaf4b68790dae4506c1f5db237077e1ff0ae
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:03:50 2013 +0100
+
+ mon: OSDMonitor: update latest_full while rebuilding full maps
+
+ Not doing so will make the monitor rebuild the osdmap full versions, even
+ though they may have been rebuilt before, every time the monitor starts.
+
+ This mostly happens when the cluster is left in an unhealthy state for
+ a long period of time and incremental versions build up. Even though we
+ build the full maps on update_from_paxos(), not updating 'full_latest'
+ leads to the situation initially described.
+
+ Fixes: #6322
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 81983bab3630520d6c7ee9b7e4a747bc17b8c5c3)
+
+commit 9b9edb04581cca15e67c567332529f5b3f426743
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:00:55 2013 +0100
+
+ mon: OSDMonitor: smaller transactions when rebuilding full versions
+
+ Otherwise, for considerably sized rebuilds, the monitor will not only
+ consume vast amounts of memory, but it will also have troubles committing
+ the transaction. Anyway, it's also a good idea to adjust transactions to
+ the granularity we want, and to be fair we care that each rebuilt full map
+ gets to disk, even if subsequent full maps don't (those can be rebuilt
+ later).
+
+ Fixes: #6323
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 4ac1570c5cdcd6556dc291cc6d7878fd92d343ae)
+
+commit 298811f7a15541b9ec1015c416ad2aa075be5691
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Wed Aug 28 15:51:01 2013 +0100
+
+ mon: OSDMonitor: check if pool is on unmanaged snaps mode on mk/rmsnap
+
+ Backport: dumpling
+ Fixes: #6047
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit fab79543c54c2e446d3f76520d7906645c6b0075)
+
+commit a992664435db9dde3745eb7f354cce3fc5400a47
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:32:17 2013 -0700
+
+ lru_map: don't use list::size()
+
+ replace list::size() with map::size(), which should have
+ a constant time complexity.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 7c1d2ded8fa8061bf3f14932800998b963745dd1)
+
+commit 788546ea71c994ff35323747294ed9c177fe7020
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:30:19 2013 -0700
+
+ common/lru_map: rename tokens to entries
+
+ This code was originally used in a token cache, now
+ as a generic infrastructure rename token fields.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 532e41a9985a16b35a6e49cdcba38af0ad166fa8)
+
+commit babeb00c42af760b3e7575166479e95365cfcc0a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 18 10:37:21 2013 -0700
+
+ rgw: use bufferlist::append() instead of bufferlist::push_back()
+
+ push_back() expects char *, whereas append can append a single char.
+ Appending a NULL char to push_back is cast as a NULL pointer which is
+ bad.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit 08fe028bad13096d482454a2f303158727c363ff)
+
+commit daf85c45dd4d158bc7c33a2fb784857bc7db35cd
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 13:46:31 2013 -0700
+
+ rgw: NULL terminate buffer before parsing it
+
+ Fixes: #6175
+ Backport: dumpling
+ We get a buffer off the remote gateway which might
+ not be NULL terminated. The JSON parser needs the
+ buffer to be NULL terminated even though we provide
+ a buffer length as it calls strlen().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit e7f7483192cddca1159aba439ce62b1e78669d51)
+
+commit c73040a5518971813b9ebaae1624c5bacef315d0
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 22:30:12 2013 -0700
+
+ rgw: don't call list::size() in ObjectCache
+
+ Fixes: #6286
+ Use an external counter instead of calling list::size()
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6)
+
+commit a855aba9d18936e9a060119e041518790cd4b831
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Sep 10 12:18:55 2013 -0700
+
+ rgw: drain pending requests before completing write
+
+ Fixes: #6268
+ When doing aio write of objects (either regular or multipart parts) we
+ need to drain pending aio requests. Otherwise if gateway goes down then
+ object might end up corrupted.
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 626669afaa333d73707553a85f5c874e99e9cbd8)
+
+commit 670db7e80ddc9c26c43a4f66907a5996ce207c4d
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 6 22:33:38 2013 -0700
+
+ rgw: fix get cors, delete cors
+
+ Remove a couple of variables that overrode class member. Not
+ really clear how it was working before, might have been a bad
+ merge / rebase.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 13872785aeeddbe1b8dd97e49fd6a2d879514f8d)
+
+commit a304016fa01b02efd500135c00b9bf3407a9999c
+Merge: 408cd61 ac0a30f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 09:47:10 2013 -0700
+
+ Merge branch 'wip-6078-dumpling' into dumpling
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+
+commit ac0a30feb8c64a3b80d9c519a7b561213403afab
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:25:20 2013 -0700
+
+ rgw: fix certain return status cases in CORS
+
+ Change return values in certain cases, reorder
+ checks, etc.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 13b28cc3f1eb8ef42875b630c485ee0105cd244a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:24:36 2013 -0700
+
+ rgw: add COPY method to be handled by CORS
+
+ Was missing this http method.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit d45c87ea738807487e72c0719b0d3d459cbe19e9
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:45 2013 -0700
+
+ rgw: fix CORS rule check
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 986fa92a7a1d88111ba28457160adfcfdaabc5d2
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:18 2013 -0700
+
+ rgw: don't handle CORS if rule not found (is NULL)
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 71873aba6553492d3ad71596cefd7c841030a277
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 13:38:55 2013 -0700
+
+ rgw: tie CORS header response to all relevant operations
+
+ Have the CORS responses on all relevant operations. Also add headers
+ on failure cases.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 94e7b594d85dbd26e58d823b41f418032e9f163f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 10:00:53 2013 -0700
+
+ rgw: add a generic CORS response handling
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit c3385d8a102faf5379559bb98cf89637ceda1579
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 17:22:46 2013 -0700
+
+ rgw: OPTIONS request doesn't need to read object info
+
+ This is a bucket-only operation, so we shouldn't look at the
+ object. Object may not exist and we might respond with Not
+ Exists response which is not what we want.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit a5fdd44e5d8ce4b8d82273d83e27aea19e63aa7c
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 14:43:28 2013 -0700
+
+ rgw: remove use of s->bucket_cors
+
+ Some old code still tried to use s->bucket_cors, which was
+ abandoned in a cleanup work.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index cc1efe4b4bf..0586c46c3bb 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -3,8 +3,8 @@ Erasure Coded Placement Groups
==============================
The documentation of the erasure coding implementation in Ceph was
-created in July 2013. It is included in Ceph even before erasure
-coding is available because it drives a number of architectural
+created in July 2013. It is included in Ceph even before erasure coded
+pools are available because it drives a number of architectural
changes. It is meant to be updated to reflect the `progress of these
architectural changes <http://tracker.ceph.com/issues/4929>`_, up to
the point where it becomes a reference of the erasure coding
@@ -14,8 +14,14 @@ Glossary
--------
*chunk*
- when the encoding function is called, it returns chunks of the
- same size.
+ when the encoding function is called, it returns chunks of the same
+ size. Data chunks which can be concatenated to reconstruct the original
+ object and coding chunks which can be used to rebuild a lost chunk.
+
+*chunk rank*
+ the index of a chunk when returned by the encoding function. The
+ rank of the first chunk is 0, the rank of the second chunk is 1
+ etc.
*stripe*
when an object is too large to be encoded with a single call,
@@ -23,9 +29,13 @@ Glossary
called a stripe.
*shard|strip*
- the file that holds all chunks of a same rank for a given object.
+ an ordered sequence of chunks of the same rank from the same
+ object. For a given placement group, each OSD contains shards of
+ the same rank. When dealing with objects that are encoded with a
+ single operation, *chunk* is sometimes used instead of *shard*
+ because the shard is made of a single chunk.
-Example:
+The definitions are illustrated as follows:
::
OSD 40 OSD 33
@@ -53,6 +63,6 @@ Table of content
.. toctree::
:maxdepth: 1
- High level design document <erasure_coding/pgbackend>
Developer notes <erasure_coding/developer_notes>
- Draft PGBackend.h header <erasure_coding/PGBackend-h>
+ Jerasure plugin <erasure_coding/jerasure>
+ High level design document <erasure_coding/pgbackend>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
deleted file mode 100644
index b39cdb0e88e..00000000000
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ /dev/null
@@ -1,156 +0,0 @@
-===========
-PGBackend.h
-===========
-
-Work in progress:
-::
-
- /**
- * PGBackend
- *
- * PGBackend defines an interface for logic handling IO and
- * replication on RADOS objects. The PGBackend implementation
- * is responsible for:
- *
- * 1) Handling client operations
- * 2) Handling object recovery
- * 3) Handling object access
- */
- class PGBackend {
- public:
- /// IO
-
- /// Perform write
- int perform_write(
- const vector<OSDOp> &ops, ///< [in] ops to perform
- Context *onreadable, ///< [in] called when readable on all reaplicas
- Context *onreadable, ///< [in] called when durable on all replicas
- ) = 0; ///< @return 0 or error
-
- /// Attempt to roll back a log entry
- int try_rollback(
- const pg_log_entry_t &entry, ///< [in] entry to roll back
- ObjectStore::Transaction *t ///< [out] transaction
- ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back
-
- /// Perform async read, oncomplete is called when ops out_bls are filled in
- int perform_read(
- vector<OSDOp> &ops, ///< [in, out] ops
- Context *oncomplete ///< [out] called with r code
- ) = 0; ///< @return 0 or error
-
- /// Peering
-
- /**
- * have_enough_infos
- *
- * Allows PGBackend implementation to ensure that enough peers have
- * been contacted to satisfy its requirements.
- *
- * TODO: this interface should yield diagnostic info about which infos
- * are required
- */
- bool have_enough_infos(
- const map<epoch_t, pg_interval_t> &past_intervals, ///< [in] intervals
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- ) = 0; ///< @return true if we can continue peering
-
- /**
- * choose_acting
- *
- * Allows PGBackend implementation to select the acting set based on the
- * received infos
- *
- * @return False if the current acting set is inadequate, *req_acting will
- * be filled in with the requested new acting set. True if the
- * current acting set is adequate, *auth_log will be filled in
- * with the correct location of the authoritative log.
- */
- bool choose_acting(
- const map<int, pg_info_t> &peer_infos, ///< [in] received infos
- int *auth_log, ///< [out] osd with auth log
- vector<int> *req_acting ///< [out] requested acting set
- ) = 0;
-
- /// Scrub
-
- /// scan
- int scan(
- const hobject_t &start, ///< [in] scan objects >= start
- const hobject_t &up_to, ///< [in] scan objects < up_to
- vector<hobject_t> *out ///< [out] objects returned
- ) = 0; ///< @return 0 or error
-
- /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
- int scrub(
- const hobject_t &to_stat, ///< [in] object to stat
- bool deep, ///< [in] true if deep scrub
- ScrubMap::object *o ///< [out] result
- ) = 0; ///< @return 0 or error
-
- /**
- * compare_scrub_maps
- *
- * @param inconsistent [out] map of inconsistent pgs to pair<correct, incorrect>
- * @param errstr [out] stream of text about inconsistencies for user
- * perusal
- *
- * TODO: this interface doesn't actually make sense...
- */
- void compare_scrub_maps(
- const map<int, ScrubMap> &maps, ///< [in] maps to compare
- bool deep, ///< [in] true if scrub is deep
- map<hobject_t, pair<set<int>, set<int> > > *inconsistent,
- std:ostream *errstr
- ) = 0;
-
- /// Recovery
-
- /**
- * might_have_unrecoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- * @param intervals [in] past intervals
- * @param should_query [out] pair<int, cpg_t> shards to query
- */
- void might_have_unrecoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const map<epoch_t, pg_interval_t> &past_intervals,
- set<pair<int, cpg_t> > *should_query
- ) = 0;
-
- /**
- * might_have_unfound
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- */
- bool recoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const hobject_t &hoid ///< [in] object to check
- ) = 0; ///< @return true if object can be recovered given missing
-
- /**
- * recover_object
- *
- * Triggers a recovery operation on the specified hobject_t
- * onreadable must be called before onwriteable
- *
- * @param missing [in] set of info, missing pairs for queried nodes
- */
- void recover_object(
- const hobject_t &hoid, ///< [in] object to recover
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing
- Context *onreadable, ///< [in] called when object can be read
- Context *onwriteable ///< [in] called when object can be written
- ) = 0;
-
- /// Backfill
-
- /// choose_backfill
- void choose_backfill(
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- const vector<int> &acting, ///< [in] acting set
- const vector<int> &up, ///< [in] up set
- set<int> *to_backfill ///< [out] osds to backfill
- ) = 0;
- };
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index 2bc796c67e5..454f087fe53 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -10,7 +10,7 @@ of the erasure code within Ceph. It is mostly based on examples being
explained to demonstrate how things work. It is written as if the
implementation is complete although it may not be the case. For
instance the plugin system and the jerasure plugin are implemented but
-the erasure code pool is not.
+the erasure coded pool is not.
Reading and writing encoded chunks from and to OSDs
---------------------------------------------------
@@ -18,8 +18,8 @@ Reading and writing encoded chunks from and to OSDs
An erasure coded pool stores each object as K+M chunks. It is divided
into K data chunks and M coding chunks. The pool is configured to have
a size of K+M so that each chunk is stored in an OSD in the acting
-set. The rank of the chunks is stored as `an attribute of the pool
-<http://tracker.ceph.com/issues/5862>`_ containing the object.
+set. The rank of the chunk is stored as `an attribute of the object
+<http://tracker.ceph.com/issues/5862>`_.
For instance an erasure coded pool is created to use five OSDs ( K+M =
5 ) and sustain the loss of two of them ( M = 2 ).
@@ -33,9 +33,9 @@ coding chunks : the fourth with *YXY* and the fifth with *GQC*. Each
chunk is stored in an OSD in the acting set. The chunks are stored in
objects that have the same name ( *NYAN* ) but reside on different
OSDs. The order in which the chunks were created must be preserved and
-is stored as an attribute of the pool containing the object. Chunk
-*1* contains *ABC* and is stored on *OSD5* while chunk *4* contains
-*XYY* and is stored on *OSD3*.
+is stored as an attribute of the object ( shard_t ), in addition to its
+name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4*
+contains *XYY* and is stored on *OSD3*.
::
@@ -56,7 +56,7 @@ is stored as an attribute of the pool containing the object. Chunk
+--v---+ +--v---+ +--v---+ +--v---+ +--v---+
name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 | | 5 |
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+------+ +------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY | | QGC |
+--+---+ +--+---+ +--+---+ +--+---+ +--+---+
@@ -85,10 +85,12 @@ When the object *NYAN* is read from the erasure coded pool, the
decoding function reads three chunks : chunk *1* containing *ABC*,
chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild
the original content of the object *ABCDEFGHI*. The decoding function
-is informed that the chunks *2* and *5* are missing. The chunk *5*
-could not be read because the *OSD4* is *out*. The decoding function
-is called as soon as three chunks are read : *OSD2* was the slowest
-and its chunk was not taken into account.
+is informed that the chunks *2* and *5* are missing ( they are called
+*erasures* ). The chunk *5* could not be read because the *OSD4* is
+*out*. The decoding function can be called as soon as three chunks are
+read : *OSD2* was the slowest and its chunk was not taken into
+account.
+
::
+-------------------+
@@ -110,17 +112,17 @@ and its chunk was not taken into account.
+--+---+ +------+ +--+---+ +--+---+
name | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 |
+ shard | 1 | | 2 | | 3 | | 4 |
+------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY |
+--+---+ +--+---+ +--+---+ +--+---+
- ^ ^ ^ ^
- | | | |
- | | +--+---+ |
- | | | OSD1 | |
+ ^ . ^ ^
+ | TOO . | |
+ | SLOW . +--+---+ |
+ | ^ | OSD1 | |
| | +------+ |
| | +------+ |
- | SLOW +-------| OSD2 | |
+ | +-------| OSD2 | |
| +------+ |
| +------+ |
| | OSD3 |-----+
@@ -137,8 +139,9 @@ Interrupted full writes
In an erasure coded pool the primary OSD in the up set receives all
write operations. It is responsible for encoding the payload into K+M
-chunks and send them to the OSDs in the up set. It is also responsible
+chunks and sends them to the other OSDs. It is also responsible
for maintaining an authoritative version of the placement group logs.
+
::
primary
@@ -168,8 +171,8 @@ set of the placement group is made of *OSD 1*, *OSD 2* and *OSD 3*. An
object has been encoded and stored in the OSDs : the chunk D1v1
(i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2*
and C1v1 (i.e. Coding chunk number 1 version 1) on *OSD 3*. The
-placement group logs on each OSD are in sync at epoch 1 version 1
-(i.e. 1,1).
+placement group logs on each OSD are identical (i.e. 1,1).
+
::
primary
@@ -196,21 +199,23 @@ placement group logs on each OSD are in sync at epoch 1 version 1
+-----------+
*OSD 1* is the primary and receives a WRITE FULL from a client, which
-means the payload is to replace the object entirely instead of only
-overwriting a portion of it. Version two of the object is created
-to override version one. *OSD 1* encodes the payload into three
-chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*,
-D2v2 on *OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on
-*OSD 3*. Each chunk is sent to the target OSD, including the primary
-OSD which is responsible for storing chunks in addition to handling
-write operations and maintaining an authoritative version of the
-placement group logs. When an OSD receives the message instructing it
-to write the chunk, it also creates a new entry in the placement group
-logs to reflect the change. For instance, as soon as *OSD 3* stores
-*C1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its
-logs. Because the OSDs work asynchronously, some chunks may still be
-in flight ( such as *D2v2* ) while others are acknowledged and on disk
-( such as *C1v1* and *D1v1* ). ::
+means the payload is to replace the object entirely instead of
+overwriting a portion of it. Version two of the object is created to
+override version one. *OSD 1* encodes the payload into three chunks :
+D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on
+*OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on *OSD
+3*. Each chunk is sent to the target OSD, including the primary OSD
+which is responsible for storing chunks in addition to handling write
+operations and maintaining an authoritative version of the placement
+group logs. When an OSD receives the message instructing it to write
+the chunk, it also creates a new entry in the placement group logs to
+reflect the change. For instance, as soon as *OSD 3* stores *C1v2*, it
+adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because
+the OSDs work asynchronously, some chunks may still be in flight (
+such as *D2v2* ) while others are acknowledged and on disk ( such as
+*C1v1* and *D1v1* ).
+
+::
primary
+---OSD 1---+
@@ -243,6 +248,7 @@ acting set and the logs' *last_complete* pointer can move from
*1,1* to *1,2* and the files used to store the chunks of the previous
version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
*OSD 2* and *C1v1* on *OSD 3*.
+
::
+---OSD 1---+
@@ -271,13 +277,14 @@ version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
But accidents happen. If *OSD 1* goes down while *D2v2* is still in
flight, the object's version 2 is partially written : *OSD 3* has
-one chunk but does not have enough to recover. It lost two chunks :
-*D1v2* and *D2v2* but the erasure coding parameters K = 2 + M = 1
-requires that at least two chunks are available to rebuild the
+one chunk but that is not enough to recover. It lost two chunks :
+*D1v2* and *D2v2* and the erasure coding parameters K = 2 + M = 1
+require that at least two chunks are available to rebuild the
third. *OSD 4* becomes the new primary and finds that the
*last_complete* log entry ( i.e. all objects before this entry were
known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log.
+*1,1* and that will be the head of the new authoritative log.
+
::
+---OSD 2---+
@@ -299,6 +306,7 @@ known to be available on all OSDs in the previous acting set ) is
The log entry *1,2* found on *OSD 3* is divergent from the new
authoritative log provided by *OSD 4* : it is discarded and the file
containing the *C1v2* chunk is removed.
+
::
+---OSD 2---+
@@ -323,14 +331,14 @@ coding library during scrubbing and stored on the new primary *OSD 4*.
Interrupted append
------------------
-An object is coded in stripes, either because they are too big or
-because they are created with multiple operations instead of a single
-full write. A single stripe will exist/exists in the case of a full
-write, assuming the object size is not too large to encode in memory.
-When appending to an existing object, the stripe size is retrieved
-from the attributes of the object. It applies, for instance, when
-*rgw* writes an object with sequence of append instead of a single
-write. ::
+An object is coded in stripes, either because it is too big or because
+it is created with multiple write operations instead of a single full
+write. When appending to an existing object, the stripe size is
+retrieved from the attributes of the object. It applies, for instance,
+when *rgw* writes an object with a sequence of appends instead of a
+single full write.
+
+::
primary
+---OSD 1---+
@@ -354,7 +362,7 @@ write. ::
+-----------+
*OSD 1* is the primary and receives an APPEND from a client, meaning
-the payload is to be appended at the end of the object. *OSD 1*
+the payload is to be appended to the end of the object. *OSD 1*
encodes the payload into three chunks : S2D1 (i.e. Stripe two data
chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on
*OSD 2* and S2C1 (i.e. Stripe two coding chunk number 1 ) in s3 on
@@ -368,8 +376,8 @@ logs to reflect the change. For instance, as soon as *OSD 3* stores
logs. The log entry also carries the nature of the operation: in this
case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work
asynchronously, some chunks may still be in flight ( such as *S2D2* )
-while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
-).
+while others are acknowledged and on disk (such as *S2D1* and *S2C1*).
+
::
+---OSD 1---+
@@ -396,14 +404,16 @@ while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
+-----------+
If *OSD 1* goes down while *S2D2* is still in flight, the payload is
-partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does
-not have enough to recover because s1 and s2 don't have it. Two chunks
-were lost (*S2D1* and S2D2) but the erasure coding parameters K = 2 +
-M = 1 requires that at least two chunks are available to rebuild the
-third. *OSD 4* becomes the new primary and finds that the
-*last_complete* log entry ( i.e. all objects before this entry were
-known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log. ::
+partially appended : s3 (shard 3) in *OSD 3* has one chunk but does
+not have enough to recover. Two chunks were lost (*S2D1* and S2D2) but
+the erasure coding parameters K = 2 + M = 1 require that at least two
+chunks are available to rebuild the third. *OSD 4* becomes the new
+primary and finds that the *last_complete* log entry ( i.e. all
+objects before this entry were known to be available on all OSDs in
+the previous acting set ) is *1,1* and will be the head of the new
+authoritative log.
+
+::
+---OSD 2---+
|+-s2-+ log |
@@ -429,8 +439,6 @@ the stripe size.
Erasure code library
--------------------
-See also `the corresponding tracker issue <http://tracker.ceph.com/issues/5877>`_
-
Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
with parameters K+M, object O is encoded by dividing it into chunks O1,
O2, ... OM and computing coding chunks P1, P2, ... PK. Any K chunks
@@ -443,8 +451,8 @@ Reading the original content of object O could be a simple
concatenation of O1, O2, ... OM, because the plugins are using
`systematic codes
<http://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the chunks
-must be given to the erasure code library to retrieve the content of
-the object.
+must be given to the erasure code library *decode* method to retrieve
+the content of the object.
Reed-Solomon is significantly more expensive to encode than fountain
codes with the current `jerasure implementation
@@ -462,10 +470,11 @@ functions ( for Cauchy or Liberation for instance ): smaller packets
means more calls and more overhead.
Although Reed-Solomon is provided as a default, Ceph uses it via an
-`abstract API <http://tracker.ceph.com/issues/5878>`_ designed to
+`abstract API <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/osd/ErasureCodeInterface.h>`_ designed to
allow each pool to choose the plugin that implements it using
`key=value pairs when creating the pool
-<http://tracker.ceph.com/issues/6113>`_.
+<https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/mon/MonCommands.h#L483>`_.
+
::
ceph osd pool create <pool> \
@@ -473,18 +482,21 @@ allow each pool to choose the plugin that implements it using
erasure-code-plugin=<plugin>
The *<plugin>* is dynamically loaded from *<dir>* (defaults to
-*/usr/lib/ceph/erasure-code* ) and expected to implement the
-*int __erasure_code_init(char *plugin_name)* function
-which is responsible for registering an object derived from
-*ErasureCodePlugin* in the registry :
+*/usr/lib/ceph/erasure-code* ) and expected to implement the *int
+__erasure_code_init(char *plugin_name)* function which is responsible
+for registering an object derived from *ErasureCodePlugin* in the
+registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L32>`_ plugin reads:
+
::
- ErasureCodePluginRegistry::add(plugin_name,
- new ErasureCodePluginExample());
+ ErasureCodePluginRegistry &instance =
+ ErasureCodePluginRegistry::instance();
+ instance.add(plugin_name, new ErasureCodePluginExample());
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
-object can be generated:
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ reads:
+
::
virtual int factory(const map<std::string,std::string> &parameters,
@@ -493,39 +505,23 @@ object can be generated:
return 0;
}
-The *parameters* is the list of *key=value* pairs that were set when the pool
-was created. Each *key* must be prefixed with erasure-code to avoid name collisions
+The *parameters* argument is the list of *key=value* pairs that were
+set when the pool was created. Each *key* must be prefixed with
+*erasure-code* to avoid name collisions:
+
::
- ceph osd pool create <pool> \
+ ceph osd pool create poolname 123 \
erasure-code-directory=<dir> \ # mandatory
erasure-code-plugin=jerasure \ # mandatory
erasure-code-m=10 \ # optional and plugin dependant
erasure-code-k=3 \ # optional and plugin dependant
erasure-code-technique=reed_sol_van \ # optional and plugin dependant
-Erasure code jerasure plugin
-----------------------------
-
-The parameters interpreted by the jerasure plugin are:
-::
-
- ceph osd pool create <pool> \
- erasure-code-directory=<dir> \ # plugin directory absolute path
- erasure-code-plugin=jerasure \ # plugin name (only jerasure)
- erasure-code-k=<k> \ # data chunks (default 2)
- erasure-code-m=<m> \ # coding chunks (default 2)
- erasure-code-technique=<technique> \ # coding technique
-
-The coding techniques can be chosen among *reed_sol_van*,
-*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
-*blaum_roth* and *liber8tion*.
-
Scrubbing
---------
See also `Refactor scrub to use PGBackend methods <http://tracker.ceph.com/issues/5861>`_
-
The simplest form of scrubbing is to check with each OSDs holding a
chunk if it exists locally. If more thank M chunks are missing the
object is marked as lost. If up to M chunks are missing they are
@@ -547,13 +543,6 @@ built-in on a per block basis.
Notes
-----
-This document is a description of how erasure coding could be
-implemented, it does not reflect the current state of the code
-base. Possible optimizations are mentionned where relevant but the
-first implementation should not include any of them: they are
-presented to show that there is a path toward optimization starting
-from simple minded implementation.
-
If the objects are large, it may be impractical to encode and decode
them in memory. However, when using *RBD* a 1TB device is divided in
many individual 4MB objects and *RGW* does the same.
@@ -561,73 +550,3 @@ many individual 4MB objects and *RGW* does the same.
Encoding and decoding is implemented in the OSD. Although it could be
implemented client side for read write, the OSD must be able to encode
and decode on its own when scrubbing.
-
-If a partial read is required, an optimization could be to only fetch
-the chunk that contains the data instead of always fetching all
-chunks. For instance if *H* is required in the example above, chunk 3
-is read if available. Reading 3 chunks is a fallback in case chunk 3 is
-not available.
-
-Partial reads and writes
-------------------------
-
-If an object is large, reading or writing all of it when changing only
-a few bytes is expensive. It is more efficient to only read or write a
-subset of the object. When a client writes on an existing object, it
-can provide the offset and the length of the write as well as the
-payload with the `CEPH_OSD_OP_WRITE
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2542>`_
-operation. It is refered to as *partial write* and is different from
-the `CEPH_OSD_OP_WRITEFULL operation
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2552>`_
-which writes the entire object at once.
-
-When using replicas for partial writes or reads, the primary OSD
-translates them into read(2) and write(2) POSIX system calls. When
-writing, it then forwards the CEPH_OSD_OP_WRITE message to the
-replicas and waits for them to acknowledge they are done.
-
-When reading erasure coded objects, at least M chunks must be read and
-decoded to extract the desired bytes. If a `systematic code
-<https://en.wikipedia.org/wiki/Systematic_code>`_ is used ( i.e. the
-data chunks are readable by simple concatenation ) read can be
-optimized to use the chunk containing the desired bytes and rely on
-the erasure decoding function only if a chunk is missing.
-
-When writing an erasure coded object, changing even one byte requires
-that it is encoded again in full.
-
-If Ceph is only used thru the *radosgw* or *librbd*, objects will mostly
-have the same size. The *radosgw* user may upload a 1GB object, which will
-be divided into smaller 4MB objects behind the scene ( or whatever is
-set with *rgw obj stripe size* ). If a KVM is attached a 10GB RBD block
-device, it will also be divided into smaller 4BM objects ( or whatever
-size is given to the --stripe-unit argument when creating the RBD
-block ). In both cases, writing one byte at the beginning will only
-require to encode the first object and not all of them.
-
-Objects can be further divided into stripes to reduce the overhead of
-partial writes. For instance:
-::
-
- +-----------------------+
- |+---------------------+|
- || stripe 0 ||
- || [0,N) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 1 ||
- || [N,N*2) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 3 [N*2,len) ||
- |+---------------------+|
- +-----------------------+
- object of size len
-
-Each stripe is encoded independantly and the same OSDs are used for
-all of them. For instance, if stripe 0 is encoded into 3 chunks on
-OSDs 5, 8 and 9, stripe 1 is also encoded into 3 chunks on the same
-OSDs. The size of a stripe is stored as an attribute of the object.
-When writing one byte at offset N, instead of re-encoding the whole
-object it is enough to re-encode the stripe that contains it.
diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst
new file mode 100644
index 00000000000..312eac52e5d
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst
@@ -0,0 +1,22 @@
+===============
+jerasure plugin
+===============
+
+Introduction
+------------
+
+The parameters interpreted by the jerasure plugin are:
+
+::
+
+ ceph osd pool create <pool> \
+ erasure-code-directory=<dir> \ # plugin directory absolute path
+ erasure-code-plugin=jerasure \ # plugin name (only jerasure)
+ erasure-code-k=<k> \ # data chunks (default 2)
+ erasure-code-m=<m> \ # coding chunks (default 2)
+ erasure-code-technique=<technique> \ # coding technique
+
+The coding techniques can be chosen among *reed_sol_van*,
+*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
+*blaum_roth* and *liber8tion*.
+
diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
index c16354f5116..43415ba4f7e 100644
--- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
@@ -2,14 +2,13 @@
PG Backend Proposal
===================
-See also `PGBackend.h <../PGBackend-h>`_
-
Motivation
----------
-The purpose of the PG Backend interface is to abstract over the
-differences between replication and erasure coding as failure recovery
-mechanisms.
+The purpose of the `PG Backend interface
+<https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h>`_
+is to abstract over the differences between replication and erasure
+coding as failure recovery mechanisms.
Much of the existing PG logic, particularly that for dealing with
peering, will be common to each. With both schemes, a log of recent
@@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over:
positions are not interchangeable. In particular, it might make
sense for a single OSD to hold more than 1 PG copy for different
acting set positions.
-5. Selection of a pgtemp for backfill may difer between replicated
+5. Selection of a pgtemp for backfill may differ between replicated
and erasure coded backends.
6. The set of necessary osds from a particular interval required to
- to continue peering may difer between replicated and erasure
+ to continue peering may differ between replicated and erasure
coded backends.
-7. The selection of the authoritative log may difer between replicated
+7. The selection of the authoritative log may differ between replicated
and erasure coded backends.
Client Writes
@@ -78,8 +77,9 @@ Core Changes:
- Current code should be adapted to use and rollback as appropriate
APPEND, DELETE, (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
- hobjects. This probably means adapting the filestore internally to
- use a ghobject which is basically a tuple<hobject_t, gen_t,
+ hobjects. This means adapting the filestore internally to
+ use a `ghobject <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_
+ which is basically a tuple<hobject_t, gen_t,
shard_t>. The gen_t + shard_t need to be included in the on-disk
filename. gen_t is a unique object identifier to make sure there
are no name collisions when object N is created +
@@ -114,7 +114,7 @@ divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.
-The dificulty is that the current code assumes that as long as it has
+The difficulty is that the current code assumes that as long as it has
an info from at least 1 osd from the prior interval, it can complete
peering. In order to ensure that we do not end up with an
unrecoverably divergent object, a K+M erasure coded PG must hear from at
@@ -161,7 +161,7 @@ Client Reads
------------
Reads with the replicated strategy can always be satisfied
-syncronously out of the primary osd. With an erasure coded strategy,
+synchronously out of the primary osd. With an erasure coded strategy,
the primary will need to request data from some number of replicas in
order to satisfy a read. The perform_read() interface for PGBackend
therefore will be async.
@@ -192,7 +192,7 @@ include the chunk id in the object key.
Core changes:
- The filestore `ghobject_t needs to also include a chunk id
- <http://tracker.ceph.com/issues/5862>`_ making it more like
+ <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_ making it more like
tuple<hobject_t, gen_t, shard_t>.
- coll_t needs to include a shard_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
@@ -260,7 +260,7 @@ Core changes:
Recovery
--------
-See `Issue #5857`_. The logic for recovering an object depends on the backend. With
+The logic for recovering an object depends on the backend. With
the current replicated strategy, we first pull the object replica
to the primary and then concurrently push it out to the replicas.
With the erasure coded strategy, we probably want to read the
@@ -270,7 +270,7 @@ and push out the replacement chunks concurrently.
Another difference is that objects in erasure coded pg may be
unrecoverable without being unfound. The "unfound" concept
should probably then be renamed to unrecoverable. Also, the
-PGBackend impementation will have to be able to direct the search
+PGBackend implementation will have to be able to direct the search
for pg replicas with unrecoverable object chunks and to be able
to determine whether a particular object is recoverable.
@@ -281,9 +281,11 @@ Core changes:
PGBackend interfaces:
-- might_have_unrecoverable()
-- recoverable()
-- recover_object()
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L46>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L52>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L64>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L69>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L76>`_
Backfill
--------
@@ -316,6 +318,4 @@ PGBackend interfaces:
- choose_backfill(): allows the implementation to determine which osds
should be backfilled in a particular interval.
-
-.. _Issue #5857: http://tracker.ceph.com/issues/5857
-.. _Issue #5856: http://tracker.ceph.com/issues/5856 \ No newline at end of file
+.. _Issue #5856: http://tracker.ceph.com/issues/5856
diff --git a/doc/index.rst b/doc/index.rst
index 8bf5340b2f6..4068be599e5 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -90,6 +90,7 @@ about Ceph, see our `Architecture`_ section.
:maxdepth: 1
:hidden:
+ start/intro
start/index
install/index
rados/index
diff --git a/doc/install/index.rst b/doc/install/index.rst
index 347b6ae9ac2..3be09c5d0df 100644
--- a/doc/install/index.rst
+++ b/doc/install/index.rst
@@ -1,50 +1,54 @@
-==============
- Installation
-==============
-
-The Ceph Object Store is the foundation of all Ceph clusters, and it consists
-primarily of two types of daemons: Object Storage Daemons (OSDs) and monitors.
-The Ceph Object Store is based upon the concept of
-:abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, which eliminates
-single points of failure and delivers infinite scalability. For details on
-the architecture of Ceph and RADOS, refer to `Ceph Architecture`_. All Ceph
-deployments have OSDs and monitors, so you should prepare your Ceph cluster
-by focusing first on the object storage cluster.
+=======================
+ Installation (Manual)
+=======================
.. raw:: html
- <table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3>
-
-To begin using Ceph in production, you should review our hardware
-recommendations and operating system recommendations. Many of the
-frequently-asked questions in our mailing list involve hardware-related
-questions and how to install Ceph on various distributions.
+ <table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Advanced Package Tool (APT)</h3>
+
+If you are deploying a Ceph cluster on Debian or Ubuntu distributions,
+use the instructions below to install packages manually.
.. toctree::
:maxdepth: 2
- Hardware Recommendations <hardware-recommendations>
- OS Recommendations <os-recommendations>
-
-.. raw:: html
+ Installing Debian/Ubuntu Packages <debian>
+ Installing on Calxeda Hardware <calxeda>
+ Installing QEMU <qemu-deb>
+ Installing libvirt <libvirt-deb>
- </td><td><h3>Installation</h3>
+.. raw:: html
-If you are deploying a Ceph cluster (that is, not developing Ceph),
-install Ceph using our stable release packages. For testing, you
-may install development release and testing packages.
+ </td><td><h3>Red Hat Package Manager (RPM) / Yellowdog Updater, Modified (YUM) </h3>
+
+If you are deploying a Ceph cluster on Red Hat (rhel6), CentOS (el6), Fedora
+17-19 (f17-f19), OpenSUSE 12 (opensuse12), and SLES (sles11) distributions, use
+the instructions below to install packages manually.
.. toctree::
:maxdepth: 2
- Installing Debian/Ubuntu Packages <debian>
Installing RPM Packages <rpm>
- Installing on Calxeda <calxeda>
+ Installing YUM Priorities <yum-priorities>
+ Installing QEMU <qemu-rpm>
+ Installing libvirt <libvirt-rpm>
+
+.. raw:: html
+
+ </td></tr><tr><td><h3>Upgrading Ceph</h3>
+
+If you are upgrading Ceph from a previous release, please read the upgrade
+documentation to ensure that you follow the proper upgrade sequence.
+
+.. toctree::
+ :maxdepth: 2
+
Upgrading Ceph <upgrading-ceph>
+
-.. raw:: html
+.. raw:: html
- </td><td><h3>Building Ceph from Source</h3>
+ </td><td><h3>Building Ceph</h3>
You can build Ceph from source by downloading a release or cloning the ``ceph``
repository at github. If you intend to build Ceph from source, please see the
@@ -63,9 +67,10 @@ will save you time.
Build a Package <build-packages>
Contributing Code <contributing>
+See the `Development`_ section for additional development details.
.. raw:: html
</td></tr></tbody></table>
-
-.. _Ceph Architecture: ../architecture/
+
+.. _Development: ../../dev \ No newline at end of file
diff --git a/doc/install/libvirt-deb.rst b/doc/install/libvirt-deb.rst
new file mode 100644
index 00000000000..9365e46c747
--- /dev/null
+++ b/doc/install/libvirt-deb.rst
@@ -0,0 +1,43 @@
+====================
+ Installing libvirt
+====================
+
+
+Prerequisites
+=============
+
+- `Install`_ and `configure`_ a Ceph Storage Cluster
+- `Install and configure`_ QEMU/KVM
+
+
+Installing ``libvirt`` on Ubuntu 12.04 Precise
+==============================================
+
+``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
+distribution. To install ``libvirt`` on precise, execute the following::
+
+ sudo apt-get update && sudo apt-get install libvirt-bin
+
+
+Installing ``libvirt`` on Earlier Versions of Ubuntu
+====================================================
+
+For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
+from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
+the build. Then, execute ``make`` and ``make install`` to complete the
+installation. For example::
+
+ git clone git://libvirt.org/libvirt.git
+ cd libvirt
+ ./autogen.sh
+ make
+ sudo make install
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _AutoGen: http://www.gnu.org/software/autogen/
+.. _Install: ../index
+.. _configure: ../../rados/configuration
+.. _Install and configure: ../../rbd/qemu-rbd
diff --git a/doc/install/libvirt-rpm.rst b/doc/install/libvirt-rpm.rst
new file mode 100644
index 00000000000..a94c6e8ae12
--- /dev/null
+++ b/doc/install/libvirt-rpm.rst
@@ -0,0 +1,19 @@
+====================
+ Installing libvirt
+====================
+
+To use ``libvirt`` with a Ceph Storage Cluster, you must
+have a running Ceph Storage Cluster. You must also install QEMU.
+See `Installing QEMU`_ for details.
+
+
+``libvirt`` packages are incorporated into the recent CentOS/RHEL distributions.
+To install ``libvirt``, execute the following::
+
+ sudo yum install libvirt
+
+See `libvirt Installation`_ for details.
+
+
+.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installing QEMU: ../qemu-rpm \ No newline at end of file
diff --git a/doc/install/qemu-deb.rst b/doc/install/qemu-deb.rst
new file mode 100644
index 00000000000..29abeafa3bc
--- /dev/null
+++ b/doc/install/qemu-deb.rst
@@ -0,0 +1,26 @@
+=================
+ Installing QEMU
+=================
+
+
+
+Installing QEMU (12.04 Precise and later)
+=========================================
+
+QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
+versions. To install QEMU, execute the following::
+
+ sudo apt-get install qemu
+
+Installing QEMU (11.10 Oneiric and earlier)
+===========================================
+
+For Ubuntu distributions 11.10 Oneiric and earlier, you must install
+the 0.15 version of QEMU or later. To build QEMU from source, use the
+following procedure::
+
+ cd {your-development-directory}
+ git clone git://git.qemu.org/qemu.git
+ cd qemu
+ ./configure --enable-rbd
+ make; make install
diff --git a/doc/install/qemu-rpm.rst b/doc/install/qemu-rpm.rst
new file mode 100644
index 00000000000..67da2c3714c
--- /dev/null
+++ b/doc/install/qemu-rpm.rst
@@ -0,0 +1,56 @@
+=================
+ Installing QEMU
+=================
+
+To install QEMU with ``yum``, you must ensure that you have
+``yum-plugin-priorities`` installed. See `Installing YUM Priorities`_
+for details.
+
+To install QEMU, execute the following:
+
+#. Create a ``/etc/yum.repos.d/ceph-qemu.conf`` file with the following
+ contents::
+
+ [ceph-qemu]
+ name=Ceph Packages for QEMU
+ baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/$basearch
+ enabled=1
+ priority=2
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+ [ceph-qemu-noarch]
+ name=Ceph QEMU noarch
+ baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/noarch
+ enabled=1
+ priority=2
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+ [ceph-qemu-source]
+ name=Ceph QEMU Sources
+ baseurl=http://ceph.com/packages/ceph-extras/rpm/centos6.3/SRPMS
+ enabled=1
+ priority=2
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
+
+#. Update your repositories. ::
+
+ sudo yum update
+
+#. Install QEMU for Ceph. ::
+
+ sudo yum install qemu-kvm qemu-kvm-tools qemu-img
+
+#. Install additional QEMU packages (optional)::
+
+ sudo yum install qemu-guest-agent qemu-guest-agent-win32
+
+See `QEMU and Block Devices`_ for usage.
+
+.. _QEMU and Block Devices: ../../rbd/qemu-rbd
+.. _Installing YUM Priorities: ../yum-priorities \ No newline at end of file
diff --git a/doc/install/rpm.rst b/doc/install/rpm.rst
index ea96d394c7a..9e8cdcd003c 100644
--- a/doc/install/rpm.rst
+++ b/doc/install/rpm.rst
@@ -7,6 +7,7 @@ development release packages (for the latest features), or development
testing packages (for development and QA only). Do not add multiple
package sources at the same time.
+
Install Release Key
===================
@@ -139,142 +140,54 @@ You can download the RPMs directly from::
-Installing Ceph Deploy
-======================
-
-Once you have added either release or development packages to ``yum``, you
-can install ``ceph-deploy``. ::
-
- sudo yum install ceph-deploy python-pushy
-
-
-
-Installing Ceph Packages
-========================
-
-Once you have added either release or development packages to ``yum``, you
-can install Ceph packages. You can also use ``ceph-deploy`` to install Ceph
-packages. ::
-
- sudo yum install ceph
-
-
-
-Installing Ceph Object Storage
-==============================
-
-:term:`Ceph Object Storage` runs on Apache and FastCGI in conjunction with the
-:term:`Ceph Storage Cluster`.
-
-#. Install Apache and FastCGI. ::
-
- rpm -ivh fcgi-2.4.0-10.el6.x86_64.rpm
- rpm -ivh mod_fastcgi-2.4.6-2.el6.rf.x86_64.rpm
-
-
-#. Install the Ceph Object Storage daemon. ::
+Adding Ceph to YUM
+==================
- yum install ceph-radosgw
+You may also add Ceph to the ``/etc/yum.repos.d`` directory. Create a
+``ceph.repo`` file. In the example below, replace ``{ceph-stable}`` with
+a stable release of Ceph (e.g., ``cuttlefish``, ``dumpling``, etc.) and
+``{distro}`` with your Linux distribution (e.g., ``el6``, ``rhel6``, etc.). ::
+ [ceph]
+ name=Ceph packages for $basearch
+ baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/$basearch
+ enabled=1
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
-#. Add the following lines to your Ceph configuration file.
+ [ceph-noarch]
+ name=Ceph noarch packages
+ baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/noarch
+ enabled=1
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
-.. code-block:: ini
+ [ceph-source]
+ name=Ceph source packages
+ baseurl=http://ceph.com/rpm-{ceph-stable}/{distro}/SRPMS
+ enabled=0
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
- [client.radosgw.gateway]
- host = {fqdn}
- keyring = /etc/ceph/keyring.radosgw.gateway
- rgw socket path = /tmp/radosgw.sock
- log file = /var/log/ceph/radosgw.log
- rgw print continue = false
-
-.. note:: Replace ``{fqdn}`` with the output from ``hostname``. This is
- important. Debian systems use the simple hostname, but on CentOS 6/RHEL 6
- you must use the fully qualified domain name.
-
-#. Create a data directory. ::
-
- mkdir -p /var/lib/ceph/radosgw/ceph-radosgw.gateway
-
-
-#. Change ``httpd ServerName`` in ``/etc/httpd/conf/httpd.conf``. ::
-
- ServerName {FQDN}
-
-
-#. Create an Apache httpd virtual host in ``/etc/httpd/conf.d/rgw.conf``. ::
-
- FastCgiExternalServer /var/www/s3gw.fcgi -socket /tmp/radosgw.sock
- <VirtualHost *:80>
- ServerName <FQDN of the host>
- ServerAdmin root@localhost
- DocumentRoot /var/www
- RewriteEngine On
- RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /s3gw.fcgi?page=$1&params=$2&%{QUERY_STRING} [E=HTTP_AUTHORIZATION:%{HTTP:Authorization},L]
- <IfModule mod_fastcgi.c>
- <Directory /var/www>
- Options +ExecCGI
- AllowOverride All
- SetHandler fastcgi-script
- Order allow,deny
- Allow from all
- AuthBasicAuthoritative Off
- </Directory>
- </IfModule>
- AllowEncodedSlashes On
- ErrorLog /var/log/httpd/error.log
- CustomLog /var/log/httpd/access.log combined
- ServerSignature Off
- </VirtualHost>
-
-#. Turn off ``fastcgiwrapper`` in ``/etc/httpd/conf.d/fastcgi.conf`` by
- commenting out the following line::
-
- #FastCgiWrapper On
-
-
-#. Add a ``fastcgi`` script with the following path ``/var/www/s3gw.fcgi``. ::
-
- #!/bin/sh
- exec /usr/bin/radosgw -c /etc/ceph/ceph.conf -n client.radosgw.gateway
-
-
-#. Make ``s3gw.fcgi`` executable::
-
- chmod +x /var/www/s3gw.fcgi
-
-
-#. Create a user key. ::
-
- ceph-authtool -C -n client.radosgw.gateway --gen-key /etc/ceph/keyring.radosgw.gateway
- ceph-authtool -n client.radosgw.gateway --cap mon 'allow rw' --cap osd 'allow rwx' /etc/ceph/keyring.radosgw.gateway
- ceph auth add client.radosgw.gateway --in-file=/etc/ceph/keyring.radosgw.gateway
-
-
-#. Please make sure ``/etc/ceph/keyring.radosgw.gateway`` file and
- ``/var/log/ceph/radosgw.log`` are accessible by the ``apache`` user. ::
-
- sudo chown apache:apache /etc/ceph/keyring.radosgw.gateway
- sudo chown apache:apache /var/log/ceph/radosgw.log
-
-.. note:: This is important. The user is ``root`` for Debian.
+Installing Ceph Deploy
+======================
-#. Create ``.rgw.buckets`` and add it to the Ceph Object Storage daemon. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install ``ceph-deploy``. ::
- rados mkpool .rgw.buckets
- radosgw-admin pool add --pool .rgw.buckets
+ sudo yum install ceph-deploy python-pushy
-#. Configure Apache and the Ceph Object Storage daemon to start on boot. ::
- chkconfig httpd on
- chkconfig ceph-radosgw on
+Installing Ceph Packages
+========================
-#. Start the services. ::
+Once you have added either release or development packages, or added a
+``ceph.repo`` file to ``/etc/yum.repos.d``, you can install Ceph packages. ::
- /etc/init.d/httpd start
- /etc/init.d/ceph-radosgw start
-
-See `Ceph Object Storage`_ for additional details.
+ sudo yum install ceph
-.. _Ceph Object Storage: ../../radosgw
+.. note:: You can also use ``ceph-deploy`` to install Ceph packages.
diff --git a/doc/install/yum-priorities.rst b/doc/install/yum-priorities.rst
new file mode 100644
index 00000000000..e4adb72b7dd
--- /dev/null
+++ b/doc/install/yum-priorities.rst
@@ -0,0 +1,20 @@
+===========================
+ Installing YUM Priorities
+===========================
+
+Ceph builds packages for Apache and FastCGI (for 100-continue support) and
+QEMU (for ``rbd`` support). You must set priorities in your ``.repo``
+files to ensure that ``yum`` installs the Ceph packages instead of the
+standard packages. The ``priorities`` setting requires you to install
+and enable ``yum-plugin-priorities``.
+
+#. Install ``yum-plugin-priorities``. ::
+
+ sudo yum install yum-plugin-priorities
+
+#. Ensure ``/etc/yum/pluginconf.d/priorities.conf`` exists.
+
+#. Ensure ``priorities.conf`` enables the plugin. ::
+
+ [main]
+ enabled = 1
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
index f50d93eb04c..2d78748f5f2 100644
--- a/doc/man/8/rbd.rst
+++ b/doc/man/8/rbd.rst
@@ -113,6 +113,10 @@ Parameters
Make json or xml formatted output more human-readable.
+.. option:: --read-only
+
+ Set device readonly when mapping image.
+
Commands
========
diff --git a/doc/rados/configuration/journal-ref.rst b/doc/rados/configuration/journal-ref.rst
index b7344544b9a..97300f4a57f 100644
--- a/doc/rados/configuration/journal-ref.rst
+++ b/doc/rados/configuration/journal-ref.rst
@@ -27,6 +27,7 @@ Ceph OSDs use a journal for two reasons: speed and consistency.
Ceph OSD Daemons support the following journal settings:
+
``journal dio``
:Description: Enables direct i/o to the journal. Requires ``journal block
@@ -37,14 +38,17 @@ Ceph OSD Daemons support the following journal settings:
:Default: ``true``
+
``journal aio``
+.. versionchanged:: 0.61 Cuttlefish
+
:Description: Enables using ``libaio`` for asynchronous writes to the journal.
Requires ``journal dio`` set to ``true``.
:Type: Boolean
:Required: No.
-:Default: ``false``
+:Default: Version 0.61 and later, ``true``. Version 0.60 and earlier, ``false``.
``journal block align``
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
index 17ae9d86b85..e3bac1fca09 100644
--- a/doc/rados/operations/add-or-rm-mons.rst
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -32,7 +32,7 @@ version of Linux installed (typically Ubuntu 12.04 precise).
Add your monitor host to a rack in your cluster, connect it to the network
and ensure that it has network connectivity.
-.. _Hardware Recommendations: ../../install/hardware-recommendations
+.. _Hardware Recommendations: ../../../start/hardware-recommendations
Install the Required Software
-----------------------------
@@ -42,17 +42,9 @@ manually. See `Installing Debian/Ubuntu Packages`_ for details.
You should configure SSH to a user with password-less authentication
and root permissions.
-.. _Installing Debian/Ubuntu Packages: ../../install/debian
+.. _Installing Debian/Ubuntu Packages: ../../../install/debian
-For clusters deployed with Chef, create a `chef user`_, `configure
-SSH keys`_, `install Ruby`_ and `install the Chef client`_ on your host. See
-`Installing Chef`_ for details.
-.. _chef user: ../../install/chef#createuser
-.. _configure SSH keys: ../../install/chef#genkeys
-.. _install the Chef client: ../../install/chef#installchef
-.. _Installing Chef: ../../install/chef
-.. _install Ruby: ../../install/chef#installruby
.. _Adding a Monitor (Manual):
diff --git a/doc/rados/operations/authentication.rst b/doc/rados/operations/authentication.rst
index 0b71d08b0c4..d9995da8fb8 100644
--- a/doc/rados/operations/authentication.rst
+++ b/doc/rados/operations/authentication.rst
@@ -126,18 +126,15 @@ you may skip the steps related to generating keys.
auth service required = cephx
auth client required = cephx
-#. Or, enable ``cephx`` authentication for versions ``0.50`` and below by
+#. Or, enable ``cephx`` authentication for Ceph versions ``0.50`` and below by
setting the following option in the ``[global]`` section of your `Ceph
- configuration`_ file::
+   configuration`_ file. **NOTE:** Deprecated as of version ``0.51``. ::
auth supported = cephx
-.. deprecated:: 0.51
-#. Start or restart the Ceph cluster. ::
+#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
- sudo service ceph -a start
- sudo service ceph -a restart
.. _disable-cephx:
@@ -157,6 +154,7 @@ during setup and/or troubleshooting to temporarily disable authentication.
auth cluster required = none
auth service required = none
auth client required = none
+ auth supported = none
#. Or, disable ``cephx`` authentication for versions ``0.50`` and below
(deprecated as of version 0.51) by setting the following option in the
@@ -164,10 +162,8 @@ during setup and/or troubleshooting to temporarily disable authentication.
auth supported = none
-#. Start or restart the Ceph cluster. ::
+#. Start or restart the Ceph cluster. See `Operating a Cluster`_ for details.
- sudo service ceph -a start
- sudo service ceph -a restart
Daemon Keyrings
@@ -422,3 +418,4 @@ of the enhanced authentication.
.. _Ceph configuration: ../../configuration/ceph-conf
.. _Cephx Configuration Reference: ../../configuration/auth-config-ref
+.. _Operating a Cluster: ../operating \ No newline at end of file
diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst
index 9942ea3cabf..8c62ed5cdbf 100644
--- a/doc/rados/operations/operating.rst
+++ b/doc/rados/operations/operating.rst
@@ -7,11 +7,10 @@
Running Ceph with Upstart
=========================
-When deploying Ceph Cuttlefish and beyond with ``ceph-deploy``, you may start
-and stop Ceph daemons on a :term:`Ceph Node` using the event-based `Upstart`_.
-Upstart does not require you to define daemon instances in the Ceph configuration
-file (although, they are still required for ``sysvinit`` should you choose to
-use it).
+When deploying Ceph Cuttlefish and beyond with ``ceph-deploy`` on Debian/Ubuntu
+distributions, you may start and stop Ceph daemons on a :term:`Ceph Node` using
+the event-based `Upstart`_. Upstart does not require you to define daemon
+instances in the Ceph configuration file.
To list the Ceph Upstart jobs and instances on a node, execute::
@@ -19,6 +18,7 @@ To list the Ceph Upstart jobs and instances on a node, execute::
See `initctl`_ for additional details.
+
Starting all Daemons
--------------------
@@ -93,29 +93,20 @@ For example::
sudo start ceph-mds id=ceph-server
-
.. index:: Ceph service; sysvinit; operating a cluster
-Running Ceph as a Service
-=========================
+Running Ceph
+============
-When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, use the
-service or traditional sysvinit.
+Each time you **start**, **restart**, or **stop** Ceph daemons (or your
+entire cluster) you must specify at least one option and one command. You may
+also specify a daemon type or a daemon instance. ::
-The ``ceph`` service provides functionality to **start**, **restart**, and
-**stop** your Ceph cluster. Each time you execute ``ceph`` processes, you
-must specify at least one option and one command. You may also specify a daemon
-type or a daemon instance. For most newer Debian/Ubuntu distributions, you may
-use the following syntax::
+ {commandline} [options] [commands] [daemons]
- sudo service ceph [options] [commands] [daemons]
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
- sudo /etc/init.d/ceph [options] [commands] [daemons]
-
-The ``ceph`` service options include:
+The ``ceph`` options include:
+-----------------+----------+-------------------------------------------------+
| Option | Shortcut | Description |
@@ -134,7 +125,7 @@ The ``ceph`` service options include:
| ``--conf`` | ``-c`` | Use an alternate configuration file. |
+-----------------+----------+-------------------------------------------------+
-The ``ceph`` service commands include:
+The ``ceph`` commands include:
+------------------+------------------------------------------------------------+
| Command | Description |
@@ -152,83 +143,213 @@ The ``ceph`` service commands include:
| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
+------------------+------------------------------------------------------------+
-For subsystem operations, the ``ceph`` service can target specific daemon types by
-adding a particular daemon type for the ``[daemons]`` option. Daemon types include:
+For subsystem operations, the ``ceph`` service can target specific daemon types
+by adding a particular daemon type for the ``[daemons]`` option. Daemon types
+include:
- ``mon``
- ``osd``
- ``mds``
-The ``ceph`` service's ``[daemons]`` setting may also target a specific instance.
-To start a Ceph daemon on the local :term:`Ceph Node`, use the following syntax::
- sudo /etc/init.d/ceph start osd.0
+Running Ceph with sysvinit
+--------------------------
-To start a Ceph daemon on another node, use the following syntax::
-
- sudo /etc/init.d/ceph -a start osd.0
+Using traditional ``sysvinit`` is the recommended way to run Ceph with CentOS,
+Red Hat, Fedora, and SLES distributions. You may also use it for older
+distributions of Debian/Ubuntu.
-Where ``osd.0`` is the first OSD in the cluster.
-
-Starting a Cluster
-------------------
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~
To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
-
- sudo service ceph [options] [start|restart] [daemonType|daemonID]
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
+Use the following syntax::
sudo /etc/init.d/ceph [options] [start|restart] [daemonType|daemonID]
The following example illustrates a typical use case::
- sudo service ceph -a start
sudo /etc/init.d/ceph -a start
Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
-operating. You may also specify a particular daemon instance to constrain the
-command to a single instance. To start a Ceph daemon on the local Ceph Node,
-use the following syntax::
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
+Use the following syntax::
+
+ sudo /etc/init.d/ceph [options] stop [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+ sudo /etc/init.d/ceph -a stop
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should stop
+operating.
+
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo /etc/init.d/ceph start {daemon-type}
+ sudo /etc/init.d/ceph start osd
+
+To start all Ceph daemons of a particular type on another node, use the
+following syntax::
+
+ sudo /etc/init.d/ceph -a start {daemon-type}
+ sudo /etc/init.d/ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo /etc/init.d/ceph stop {daemon-type}
+ sudo /etc/init.d/ceph stop osd
+
+To stop all Ceph daemons of a particular type on another node, use the
+following syntax::
+
+ sudo /etc/init.d/ceph -a stop {daemon-type}
+ sudo /etc/init.d/ceph -a stop osd
+
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo /etc/init.d/ceph start {daemon-type}.{instance}
sudo /etc/init.d/ceph start osd.0
To start a Ceph daemon on another node, use the following syntax::
+ sudo /etc/init.d/ceph -a start {daemon-type}.{instance}
sudo /etc/init.d/ceph -a start osd.0
-Stopping a Cluster
-------------------
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo /etc/init.d/ceph stop {daemon-type}.{instance}
+ sudo /etc/init.d/ceph stop osd.0
+
+To stop a Ceph daemon on another node, use the following syntax::
+
+ sudo /etc/init.d/ceph -a stop {daemon-type}.{instance}
+ sudo /etc/init.d/ceph -a stop osd.0
+
+
+Running Ceph as a Service
+-------------------------
+
+When you deploy Ceph Argonaut or Bobtail with ``mkcephfs``, you operate
+Ceph as a service (you may also use sysvinit).
+
+
+Starting all Daemons
+~~~~~~~~~~~~~~~~~~~~
+
+To start your Ceph cluster, execute ``ceph`` with the ``start`` command.
+Use the following syntax::
+
+ sudo service ceph [options] [start|restart] [daemonType|daemonID]
+
+The following example illustrates a typical use case::
+
+ sudo service ceph -a start
+
+Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should begin
+operating.
+
+
+Stopping all Daemons
+~~~~~~~~~~~~~~~~~~~~
To stop your Ceph cluster, execute ``ceph`` with the ``stop`` command.
-The usage may differ based upon your Linux distribution. For example, for most
-newer Debian/Ubuntu distributions, you may use the following syntax::
+Use the following syntax::
sudo service ceph [options] stop [daemonType|daemonID]
For example::
- sudo service ceph -a stop
-
-For older distributions, you may wish to use the ``/etc/init.d/ceph`` path::
-
- sudo /etc/init.d/ceph -a stop
+ sudo service ceph -a stop
Once you execute with ``-a`` (i.e., execute on all nodes), Ceph should shut
-down. You may also specify a particular daemon instance to constrain the
-command to a single instance. To stop a Ceph daemon on the local Ceph Node,
-use the following syntax::
+down.
+
+
+Starting all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To start all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo service ceph start {daemon-type}
+ sudo service ceph start osd
+
+To start all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+ sudo service ceph -a start {daemon-type}
+ sudo service ceph -a start osd
+
+
+Stopping all Daemons by Type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To stop all Ceph daemons of a particular type on the local Ceph Node, use the
+following syntax::
+
+ sudo service ceph stop {daemon-type}
+ sudo service ceph stop osd
+
+To stop all Ceph daemons of a particular type on all nodes, use the following
+syntax::
+
+ sudo service ceph -a stop {daemon-type}
+ sudo service ceph -a stop osd
- sudo /etc/init.d/ceph stop osd.0
+
+Starting a Daemon
+~~~~~~~~~~~~~~~~~
+
+To start a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo service ceph start {daemon-type}.{instance}
+ sudo service ceph start osd.0
+
+To start a Ceph daemon on another node, use the following syntax::
+
+ sudo service ceph -a start {daemon-type}.{instance}
+ sudo service ceph -a start osd.0
+
+
+Stopping a Daemon
+~~~~~~~~~~~~~~~~~
+
+To stop a Ceph daemon on the local Ceph Node, use the following syntax::
+
+ sudo service ceph stop {daemon-type}.{instance}
+ sudo service ceph stop osd.0
To stop a Ceph daemon on another node, use the following syntax::
- sudo /etc/init.d/ceph -a stop osd.0
+ sudo service ceph -a stop {daemon-type}.{instance}
+ sudo service ceph -a stop osd.0
diff --git a/doc/rbd/libvirt.rst b/doc/rbd/libvirt.rst
index cc8dc9bd189..4813c3258d0 100644
--- a/doc/rbd/libvirt.rst
+++ b/doc/rbd/libvirt.rst
@@ -40,46 +40,11 @@ The most common ``libvirt`` use case involves providing Ceph block devices to
cloud solutions like OpenStack or CloudStack. The cloud solution uses
``libvirt`` to interact with QEMU/KVM, and QEMU/KVM interacts with Ceph block
devices via ``librbd``. See `Block Devices and OpenStack`_ and `Block Devices
-and CloudStack`_ for details.
+and CloudStack`_ for details. See `Installation`_ for installation details.
You can also use Ceph block devices with ``libvirt``, ``virsh`` and the
``libvirt`` API. See `libvirt Virtualization API`_ for details.
-Prerequisites
-=============
-
-- `Install`_ and `configure`_ a Ceph cluster
-- `Install and configure`_ QEMU/KVM
-
-
-Installing ``libvirt`` on Ubuntu 12.04 Precise
-==============================================
-
-``libvirt`` packages are incorporated into the Ubuntu 12.04 precise
-distribution. To install ``libvirt`` on precise, execute the following::
-
- sudo apt-get update && sudo apt-get install libvirt-bin
-
-
-Installing ``libvirt`` on Earlier Versions of Ubuntu
-====================================================
-
-For Ubuntu distributions 11.10 oneiric and earlier, you must build ``libvirt``
-from source. Clone the ``libvirt`` repository, and use `AutoGen`_ to generate
-the build. Then, execute ``make`` and ``make install`` to complete the
-installation. For example::
-
- git clone git://libvirt.org/libvirt.git
- cd libvirt
- ./autogen.sh
- make
- sudo make install
-
-See `libvirt Installation`_ for details.
-
-
-Using Ceph with Virtual Machines
-================================
To create VMs that use Ceph block devices, use the procedures in the following
sections. In the following examples, we've used ``libvirt-pool`` for the pool
@@ -89,7 +54,7 @@ when executing commands in the subsequent procedures.
Configuring Ceph
-----------------
+================
To configure Ceph for use with ``libvirt``, perform the following steps:
@@ -132,7 +97,7 @@ To configure Ceph for use with ``libvirt``, perform the following steps:
Preparing the VM Manager
-------------------------
+========================
You may use ``libvirt`` without a VM manager, but you may find it simpler to
create your first domain with ``virt-manager``.
@@ -150,7 +115,7 @@ create your first domain with ``virt-manager``.
Creating a VM
--------------
+=============
To create a VM with ``virt-manager``, perform the following steps:
@@ -182,7 +147,7 @@ To create a VM with ``virt-manager``, perform the following steps:
Configuring the VM
-------------------
+==================
When configuring the VM for use with Ceph, it is important to use ``virsh``
where appropriate. Additionally, ``virsh`` commands often require root
@@ -290,7 +255,7 @@ commands, refer to `Virsh Command Reference`_.
Summary
--------
+=======
Once you have configured the VM for use with Ceph, you can start the VM.
To verify that the VM and Ceph are communicating, you may perform the
@@ -320,13 +285,8 @@ If everything looks okay, you may begin using the Ceph block device
within your VM.
-
-.. _AutoGen: http://www.gnu.org/software/autogen/
-.. _libvirt Installation: http://www.libvirt.org/compiling.html
+.. _Installation: ../../install
.. _libvirt Virtualization API: http://www.libvirt.org
-.. _Install: ../../install
-.. _configure: ../../rados/configuration
-.. _Install and configure: ../qemu-rbd
.. _Block Devices and OpenStack: ../rbd-openstack
.. _Block Devices and CloudStack: ../rbd-cloudstack
.. _Create a pool: ../../rados/operations/pools#create-a-pool
diff --git a/doc/rbd/qemu-rbd.rst b/doc/rbd/qemu-rbd.rst
index 9d366f3ea8d..e0b55dee257 100644
--- a/doc/rbd/qemu-rbd.rst
+++ b/doc/rbd/qemu-rbd.rst
@@ -27,33 +27,12 @@ image each time it spins up a new virtual machine.
Ceph Block Devices can integrate with the QEMU virtual machine. For details on
QEMU, see `QEMU Open Source Processor Emulator`_. For QEMU documentation, see
-`QEMU Manual`_.
+`QEMU Manual`_. For installation details, see `Installation`_.
.. important:: To use Ceph Block Devices with QEMU, you must have access to a
running Ceph cluster.
-Installing QEMU (12.04 Precise and later)
-=========================================
-
-QEMU packages are incorporated into Ubuntu 12.04 Precise Pangolin and later
-versions. To install QEMU, execute the following::
-
- sudo apt-get install qemu
-
-Installing QEMU (11.10 Oneric and earlier)
-==========================================
-
-For Ubuntu distributions 11.10 Oneiric and earlier, you must install
-the 0.15 version of QEMU or later. To build QEMU from source, use the
-following procedure::
-
- cd {your-development-directory}
- git clone git://git.qemu.org/qemu.git
- cd qemu
- ./configure --enable-rbd
- make; make install
-
Creating Images with QEMU
=========================
@@ -199,4 +178,5 @@ QEMU command line settings override the Ceph configuration file settings.
.. _QEMU Open Source Processor Emulator: http://wiki.qemu.org/Main_Page
.. _QEMU Manual: http://wiki.qemu.org/Manual
.. _RBD Cache: ../rbd-config-ref/
-.. _Snapshots: ../rbd-snapshot/ \ No newline at end of file
+.. _Snapshots: ../rbd-snapshot/
+.. _Installation: ../../install \ No newline at end of file
diff --git a/doc/rbd/rbd-openstack.rst b/doc/rbd/rbd-openstack.rst
index 660757639aa..80dd43ce406 100644
--- a/doc/rbd/rbd-openstack.rst
+++ b/doc/rbd/rbd-openstack.rst
@@ -127,7 +127,7 @@ Hosts running ``nova-compute`` do not need the keyring. Instead, they
store the secret key in libvirt. Create a temporary copy of the secret
key on the hosts running ``nova-compute``::
- ssh {your-compute-host} client.volumes.key <`ceph auth get-key client.volumes`
+ ceph auth get-key client.volumes | ssh {your-compute-host} tee client.volumes.key
Then, on the compute hosts, add the secret key to libvirt and remove the
temporary copy of the key::
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index 604b4fa296b..2b566baa0ea 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -2,6 +2,37 @@
Release Notes
===============
+v0.70
+-----
+
+Upgrading
+~~~~~~~~~
+
+* librados::Rados::pool_create_async() and librados::Rados::pool_delete_async()
+ don't drop a reference to the completion object on error, caller needs to take
+ care of that. This has never really worked correctly and we were leaking an
+  object.
+
+* 'ceph osd crush set <id> <weight> <loc..>' no longer adds the osd to the
+ specified location, as that's a job for 'ceph osd crush add'. It will
+ however continue to work just the same as long as the osd already exists
+ in the crush map.
+
+Notable Changes
+~~~~~~~~~~~~~~~
+
+* mon: a few 'ceph mon add' races fixed (command is now idempotent) (Joao Luis)
+* crush: fix name caching
+* rgw: fix a few minor memory leaks (Yehuda Sadeh)
+* ceph: improve parsing of CEPH_ARGS (Benoit Knecht)
+* mon: avoid rewriting full osdmaps on restart (Joao Luis)
+* crc32c: fix optimized crc32c code (it now detects arch support properly)
+* mon: fix 'ceph osd crush reweight ...' (Joao Luis)
+* osd: revert xattr size limit (fixes large rgw uploads)
+* mds: fix heap profiler commands (Joao Luis)
+* rgw: fix inefficient use of std::list::size() (Yehuda Sadeh)
+
+
v0.69
-----
@@ -19,6 +50,28 @@ Upgrading
the because the server-side behavior has changed it is possible that
an application misusing the interface may now get errors.
+* The OSD now enforces that class write methods cannot both mutate an
+ object and return data. The rbd.assign_bid method, the lone
+ offender, has been removed. This breaks compatibility with
+ pre-bobtail librbd clients by preventing them from creating new
+ images.
+
+* librados now returns on commit instead of ack for synchronous calls.
+ This is a bit safer in the case where both OSDs and the client crash, and
+ is probably how it should have been acting from the beginning. Users are
+ unlikely to notice but it could result in lower performance in some
+ circumstances. Those who care should switch to using the async interfaces,
+ which let you specify safety semantics precisely.
+
+* The C++ librados AioComplete::get_version() method was incorrectly
+ returning an int (usually 32-bits). To avoid breaking library
+ compatibility, a get_version64() method is added that returns the
+ full-width value. The old method is deprecated and will be removed
+ in a future release. Users of the C++ librados API that make use of
+ the get_version() method should modify their code to avoid getting a
+  value that is truncated from 64 to 32 bits.
+
+
Notable Changes
~~~~~~~~~~~~~~~
@@ -120,6 +173,40 @@ Notable Changes
* sysvinit: add condrestart command (Dan van der Ster)
+
+v0.67.4 "Dumpling"
+------------------
+
+This point release fixes an important performance issue with radosgw,
+keystone authentication token caching, and CORS. All users
+(especially those of rgw) are encouraged to upgrade.
+
+Notable changes
+~~~~~~~~~~~~~~~
+
+* crush: fix invalidation of cached names
+* crushtool: do not crash on non-unique bucket ids
+* mds: be more careful when decoding LogEvents
+* mds: fix heap check debugging commands
+* mon: avoid rebuilding old full osdmaps
+* mon: fix 'ceph crush move ...'
+* mon: fix 'ceph osd crush reweight ...'
+* mon: fix writeout of full osdmaps during trim
+* mon: limit size of transactions
+* mon: prevent both unmanaged and pool snaps
+* osd: disable xattr size limit (prevents upload of large rgw objects)
+* osd: fix recovery op throttling
+* osd: fix throttling of log messages for very slow requests
+* rgw: drain pending requests before completing write
+* rgw: fix CORS
+* rgw: fix inefficient list::size() usage
+* rgw: fix keystone token expiration
+* rgw: fix minor memory leaks
+* rgw: fix null termination of buffer
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.4.txt>`.
+
+
v0.67.3 "Dumpling"
------------------
diff --git a/doc/install/hardware-recommendations.rst b/doc/start/hardware-recommendations.rst
index 90d29e5e7e2..90d29e5e7e2 100644
--- a/doc/install/hardware-recommendations.rst
+++ b/doc/start/hardware-recommendations.rst
diff --git a/doc/start/index.rst b/doc/start/index.rst
index 2fc03c0a284..6e9277746d9 100644
--- a/doc/start/index.rst
+++ b/doc/start/index.rst
@@ -1,34 +1,6 @@
-=================
- Getting Started
-=================
-
-Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block
-Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or
-use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin
-with setting up each :term:`Ceph Node`, your network and the Ceph Storage
-Cluster. A Ceph Storage Cluster has three essential daemons:
-
-.. ditaa:: +---------------+ +---------------+ +---------------+
- | OSDs | | Monitor | | MDS |
- +---------------+ +---------------+ +---------------+
-
-- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data
- replication, recovery, backfilling, rebalancing, and provides some monitoring
- information to Ceph Monitors by checking other Ceph OSD Daemons for a
- heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to
- achieve an ``active + clean`` state.
-
-- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state,
- including the monitor map, the OSD map, the Placement Group (PG) map, and the
- CRUSH map. Ceph maintains a history (called an "epoch") of each state change
- in the Ceph Monitors, Ceph OSD Daemons, and PGs.
-
-- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of
- the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage
- do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system
- users to execute basic commands like ``ls``, ``find``, etc. without placing
- an enormous burden on the Ceph Storage Cluster.
-
+======================
+ Installation (Quick)
+======================
.. raw:: html
@@ -37,18 +9,17 @@ Cluster. A Ceph Storage Cluster has three essential daemons:
A :term:`Ceph Client` and a :term:`Ceph Node` may require some basic
configuration work prior to deploying a Ceph Storage Cluster. You can also
-avail yourself of help from the Ceph community by getting involved.
+avail yourself of help by getting involved in the Ceph community.
.. toctree::
- Get Involved <get-involved>
Preflight <quick-start-preflight>
.. raw:: html
</td><td><h3>Step 2: Storage Cluster</h3>
-Once you've completed your preflight checklist, you should be able to begin
+Once you've completed your preflight checklist, you should be able to begin
deploying a Ceph Storage Cluster.
.. toctree::
diff --git a/doc/start/intro.rst b/doc/start/intro.rst
new file mode 100644
index 00000000000..704ff1e8cd5
--- /dev/null
+++ b/doc/start/intro.rst
@@ -0,0 +1,70 @@
+===============
+ Intro to Ceph
+===============
+
+Whether you want to provide :term:`Ceph Object Storage` and/or :term:`Ceph Block
+Device` services to :term:`Cloud Platforms`, deploy a :term:`Ceph Filesystem` or
+use Ceph for another purpose, all :term:`Ceph Storage Cluster` deployments begin
+with setting up each :term:`Ceph Node`, your network and the Ceph Storage
+Cluster. A Ceph Storage Cluster requires at least one Ceph Monitor and at least
+two Ceph OSD Daemons. The Ceph Metadata Server is essential when running Ceph
+Filesystem clients.
+
+.. ditaa:: +---------------+ +---------------+ +---------------+
+ | OSDs | | Monitor | | MDS |
+ +---------------+ +---------------+ +---------------+
+
+- **OSDs**: A :term:`Ceph OSD Daemon` (OSD) stores data, handles data
+ replication, recovery, backfilling, rebalancing, and provides some monitoring
+ information to Ceph Monitors by checking other Ceph OSD Daemons for a
+ heartbeat. A Ceph Storage Cluster requires at least two Ceph OSD Daemons to
+ achieve an ``active + clean`` state when the cluster makes two copies of your
+ data (Ceph makes 2 copies by default, but you can adjust it).
+
+- **Monitors**: A :term:`Ceph Monitor` maintains maps of the cluster state,
+ including the monitor map, the OSD map, the Placement Group (PG) map, and the
+ CRUSH map. Ceph maintains a history (called an "epoch") of each state change
+ in the Ceph Monitors, Ceph OSD Daemons, and PGs.
+
+- **MDSs**: A :term:`Ceph Metadata Server` (MDS) stores metadata on behalf of
+ the :term:`Ceph Filesystem` (i.e., Ceph Block Devices and Ceph Object Storage
+ do not use MDS). Ceph Metadata Servers make it feasible for POSIX file system
+ users to execute basic commands like ``ls``, ``find``, etc. without placing
+ an enormous burden on the Ceph Storage Cluster.
+
+Ceph stores a client's data as objects within storage pools. Using the CRUSH
+algorithm, Ceph calculates which placement group should contain the object,
+and further calculates which Ceph OSD Daemon should store the placement group.
+The CRUSH algorithm enables the Ceph Storage Cluster to scale, rebalance, and
+recover dynamically.
+
+
+.. raw:: html
+
+ <style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
+ <table cellpadding="10"><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Recommendations</h3>
+
+To begin using Ceph in production, you should review our hardware
+recommendations and operating system recommendations.
+
+.. toctree::
+ :maxdepth: 2
+
+ Hardware Recommendations <hardware-recommendations>
+ OS Recommendations <os-recommendations>
+
+
+.. raw:: html
+
+ </td><td><h3>Get Involved</h3>
+
+ You can avail yourself of help or contribute documentation, source
+ code, or bug reports by getting involved in the Ceph community.
+
+.. toctree::
+
+ get-involved
+
+.. raw:: html
+
+ </td></tr></tbody></table>
diff --git a/doc/install/os-recommendations.rst b/doc/start/os-recommendations.rst
index 71a4d3a278b..d8b418fe1b0 100644
--- a/doc/install/os-recommendations.rst
+++ b/doc/start/os-recommendations.rst
@@ -36,6 +36,36 @@ platforms. Generally speaking, there is very little dependence on
specific distributions aside from the kernel and system initialization
package (i.e., sysvinit, upstart, systemd).
+
+Dumpling (0.67)
+---------------
+
++----------+----------+--------------------+--------------+---------+------------+
+| Distro | Release | Code Name | Kernel | Notes | Testing |
++==========+==========+====================+==============+=========+============+
+| Ubuntu | 12.04 | Precise Pangolin | linux-3.2.0 | 1, 2 | B, I, C |
++----------+----------+--------------------+--------------+---------+------------+
+| Ubuntu | 12.10 | Quantal Quetzal | linux-3.5.4 | 2 | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Ubuntu | 13.04 | Raring Ringtail | linux-3.8.5 | | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Debian | 6.0 | Squeeze | linux-2.6.32 | 1, 2, 3 | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Debian | 7.0 | Wheezy | linux-3.2.0 | 1, 2 | B |
++----------+----------+--------------------+--------------+---------+------------+
+| CentOS | 6.3 | N/A | linux-2.6.32 | 1, 2 | B, I |
++----------+----------+--------------------+--------------+---------+------------+
+| RHEL | 6.3 | | linux-2.6.32 | 1, 2 | B, I |
++----------+----------+--------------------+--------------+---------+------------+
+| Fedora | 18.0 | Spherical Cow | linux-3.6.0 | | B |
++----------+----------+--------------------+--------------+---------+------------+
+| Fedora | 19.0 | Schrödinger's Cat | linux-3.10.0 | | B |
++----------+----------+--------------------+--------------+---------+------------+
+| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B |
++----------+----------+--------------------+--------------+---------+------------+
+
+
+
Cuttlefish (0.61)
-----------------
@@ -63,6 +93,7 @@ Cuttlefish (0.61)
| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B |
+----------+----------+--------------------+--------------+---------+------------+
+
Bobtail (0.56)
--------------
@@ -90,6 +121,7 @@ Bobtail (0.56)
| OpenSuse | 12.2 | N/A | linux-3.4.0 | 2 | B |
+----------+----------+--------------------+--------------+---------+------------+
+
Argonaut (0.48)
---------------
@@ -126,6 +158,7 @@ Notes
``ceph-osd`` daemons using ``XFS`` or ``ext4`` on the same host will
not perform as well as they could.
+
Testing
-------
diff --git a/doc/start/quick-ceph-deploy.rst b/doc/start/quick-ceph-deploy.rst
index 3c0ca1b0653..1fabd1b182f 100644
--- a/doc/start/quick-ceph-deploy.rst
+++ b/doc/start/quick-ceph-deploy.rst
@@ -3,26 +3,31 @@
=============================
If you haven't completed your `Preflight Checklist`_, do that first. This
-**Quick Start** sets up a two-node demo cluster so you can explore some of the
-:term:`Ceph Storage Cluster` functionality. This **Quick Start** will help you
-install a minimal Ceph Storage Cluster on a server node from your admin node
-using ``ceph-deploy``.
+**Quick Start** sets up a :term:`Ceph Storage Cluster` using ``ceph-deploy``
+on your admin node. Create a three Ceph Node cluster so you can
+explore Ceph functionality.
.. ditaa::
- /----------------\ /----------------\
- | Admin Node |<------->| Server Node |
- | cCCC | | cCCC |
- +----------------+ +----------------+
- | Ceph Commands | | ceph - mon |
- \----------------/ +----------------+
- | ceph - osd |
- +----------------+
- | ceph - mds |
- \----------------/
-
-
-For best results, create a directory on your admin node for maintaining the
-configuration of your cluster. ::
+ /------------------\ /----------------\
+ | Admin Node | | ceph–node1 |
+ | +-------->+ cCCC |
+ | ceph–deploy | | mon.ceph–node1 |
+ \---------+--------/ \----------------/
+ |
+ | /----------------\
+ | | ceph–node2 |
+ +----------------->+ cCCC |
+ | | osd.0 |
+ | \----------------/
+ |
+ | /----------------\
+ | | ceph–node3 |
+ +----------------->| cCCC |
+ | osd.1 |
+ \----------------/
+
+For best results, create a directory on your admin node for maintaining the
+configuration that ``ceph-deploy`` generates for your cluster. ::
mkdir my-cluster
cd my-cluster
@@ -31,228 +36,283 @@ configuration of your cluster. ::
current directory. Ensure you are in this directory when executing
``ceph-deploy``.
+As a first exercise, create a Ceph Storage Cluster with one Ceph Monitor and two
+Ceph OSD Daemons. Once the cluster reaches an ``active + clean`` state, expand it
+by adding a third Ceph OSD Daemon, a Metadata Server and two more Ceph Monitors.
+
+.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root``
+ if you are logged in as a different user, because it will not issue ``sudo``
+ commands needed on the remote host.
Create a Cluster
================
-To create your Ceph Storage Cluster, declare its initial monitors, generate a
-filesystem ID (``fsid``) and generate monitor keys by entering the following
-command on a commandline prompt::
+If at any point you run into trouble and you want to start over, execute
+the following::
- ceph-deploy new {mon-server-name}
- ceph-deploy new mon-ceph-node
+ ceph-deploy purgedata {ceph-node} [{ceph-node}]
+ ceph-deploy forgetkeys
-Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current
-directory. You should see a Ceph configuration file, a keyring, and a log file
-for the new cluster. See `ceph-deploy new -h`_ for additional details.
-.. topic:: Single Node Quick Start
+On your admin node, perform the following steps using ``ceph-deploy``.
- Assuming only one node for your Ceph Storage Cluster, you will need to
- modify the default ``osd crush chooseleaf type`` setting (it defaults to
- ``1`` for ``node``) to ``0`` for ``device`` so that it will peer with OSDs
- on the local node. Add the following line to your Ceph configuration file::
-
- osd crush chooseleaf type = 0
+#. Create the cluster. ::
-.. tip:: If you deploy without executing foregoing step on a single node
- cluster, your Ceph Storage Cluster will not achieve an ``active + clean``
- state. To remedy this situation, you must modify your `CRUSH Map`_.
+ ceph-deploy new {ceph-node}
+ ceph-deploy new ceph-node1
-Install Ceph
-============
+ Check the output of ``ceph-deploy`` with ``ls`` and ``cat`` in the current
+ directory. You should see a Ceph configuration file, a keyring, and a log
+ file for the new cluster. See `ceph-deploy new -h`_ for additional details.
-To install Ceph on your server node, open a command line on your admin
-node and type the following::
+#. Install Ceph. ::
- ceph-deploy install {server-node-name}[,{server-node-name}]
- ceph-deploy install mon-ceph-node
+ ceph-deploy install {ceph-node}[{ceph-node} ...]
+ ceph-deploy install ceph-node1 ceph-node2 ceph-node3
-Without additional arguments, ``ceph-deploy`` will install the most recent
-stable Ceph package to the server node. See `ceph-deploy install -h`_ for
-additional details.
-.. tip:: When ``ceph-deploy`` completes installation successfully,
- it should echo ``OK``.
+#. Add a Ceph Monitor. ::
+ ceph-deploy mon create {ceph-node}
+ ceph-deploy mon create ceph-node1
+
+#. Gather keys. ::
-Add a Monitor
-=============
+ ceph-deploy gatherkeys {ceph-node}
+ ceph-deploy gatherkeys ceph-node1
-To run a Ceph cluster, you need at least one Ceph Monitor. When using
-``ceph-deploy``, the tool enforces a single Ceph Monitor per node. Execute the
-following to create a Ceph Monitor::
+ Once you have gathered keys, your local directory should have the following
+ keyrings:
- ceph-deploy mon create {mon-server-name}
- ceph-deploy mon create mon-ceph-node
+ - ``{cluster-name}.client.admin.keyring``
+ - ``{cluster-name}.bootstrap-osd.keyring``
+ - ``{cluster-name}.bootstrap-mds.keyring``
+
-.. tip:: In production environments, we recommend running Ceph Monitors on
- nodes that do not run OSDs.
+#. Add two OSDs. For fast setup, this quick start uses a directory rather
+ than an entire disk per Ceph OSD Daemon. See `ceph-deploy osd`_ for
+ details on using separate disks/partitions for OSDs and journals.
+ Log in to the Ceph Nodes and create a directory for
+ the Ceph OSD Daemon. ::
+
+ ssh ceph-node2
+ sudo mkdir /tmp/osd0
+ exit
+
+ ssh ceph-node3
+ sudo mkdir /tmp/osd1
+ exit
-When you have added a monitor successfully, directories under ``/var/lib/ceph``
-on your server node should have subdirectories ``bootstrap-mds`` and
-``bootstrap-osd`` that contain keyrings. If these directories do not contain
-keyrings, execute ``ceph-deploy mon create`` again on the admin node.
+ Then, from your admin node, use ``ceph-deploy`` to prepare the OSDs. ::
+ ceph-deploy osd prepare {ceph-node}:/path/to/directory
+ ceph-deploy osd prepare ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1
-Gather Keys
-===========
+ Finally, activate the OSDs. ::
-To deploy additional daemons and provision them with monitor authentication keys
-from your admin node, you must first gather keys from a monitor node. Execute
-the following to gather keys::
+ ceph-deploy osd activate {ceph-node}:/path/to/directory
+ ceph-deploy osd activate ceph-node2:/tmp/osd0 ceph-node3:/tmp/osd1
- ceph-deploy gatherkeys {mon-server-name}
- ceph-deploy gatherkeys mon-ceph-node
+#. Use ``ceph-deploy`` to copy the configuration file and admin key to
+ your admin node and your Ceph Nodes so that you can use the ``ceph``
+ CLI without having to specify the monitor address and
+ ``ceph.client.admin.keyring`` each time you execute a command. ::
+
+ ceph-deploy admin {ceph-node}
+ ceph-deploy admin admin-node ceph-node1 ceph-node2 ceph-node3
-Once you have gathered keys, your local directory should have the following keyrings:
+ **Note:** Since you are using ``ceph-deploy`` to talk to the
+ local host, your host must be reachable by its hostname
+ (e.g., you can modify ``/etc/hosts`` if necessary). Ensure that
+ you have the correct permissions for the ``ceph.client.admin.keyring``.
-- ``{cluster-name}.client.admin.keyring``
-- ``{cluster-name}.bootstrap-osd.keyring``
-- ``{cluster-name}.bootstrap-mds.keyring``
+#. Check your cluster's health. ::
-If you don't have these keyrings, you may not have created a monitor successfully,
-or you may have a problem with your network connection. Ensure that you complete
-this step such that you have the foregoing keyrings before proceeding further.
+ ceph health
-.. tip:: You may repeat this procedure. If it fails, check to see if the
- ``/var/lib/ceph/boostrap-{osd}|{mds}`` directories on the server node
- have keyrings. If they do not have keyrings, try adding the monitor again;
- then, return to this step.
+ Your cluster should return an ``active + clean`` state when it
+ has finished peering.
-Add Ceph OSD Daemons
-====================
+Operating Your Cluster
+======================
-For a cluster's object placement groups to reach an ``active + clean`` state,
-you must have at least two instances of a :term:`Ceph OSD Daemon` running and
-at least two copies of an object (``osd pool default size`` is ``2``
-by default).
+Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster.
+To operate the cluster daemons with Debian/Ubuntu distributions, see
+`Running Ceph with Upstart`_. To operate the cluster daemons with CentOS,
+Red Hat, Fedora, and SLES distributions, see `Running Ceph with sysvinit`_.
-Adding Ceph OSD Daemons is slightly more involved than other ``ceph-deploy``
-commands, because a Ceph OSD Daemon involves both a data store and a journal.
-The ``ceph-deploy`` tool has the ability to invoke ``ceph-disk-prepare`` to
-prepare the disk and activate the Ceph OSD Daemon for you.
+To learn more about peering and cluster health, see `Monitoring a Cluster`_.
+To learn more about Ceph OSD Daemon and placement group health, see
+`Monitoring OSDs and PGs`_.
+
+Once you deploy a Ceph cluster, you can try out some of the administration
+functionality, the ``rados`` object store command line, and then proceed to
+Quick Start guides for Ceph Block Device, Ceph Filesystem, and the Ceph Object
+Gateway.
-Multiple OSDs on the OS Disk (Demo Only)
-----------------------------------------
-For demonstration purposes, you may wish to add multiple OSDs to the OS disk
-(not recommended for production systems). To use Ceph OSDs daemons on the OS
-disk, you must use ``prepare`` and ``activate`` as separate steps. First,
-define a directory for the Ceph OSD daemon(s). ::
-
- mkdir /tmp/osd0
- mkdir /tmp/osd1
-
-Then, use ``prepare`` to prepare the directory(ies) for use with a
-Ceph OSD Daemon. ::
-
- ceph-deploy osd prepare {osd-node-name}:/tmp/osd0
- ceph-deploy osd prepare {osd-node-name}:/tmp/osd1
+Expanding Your Cluster
+======================
-Finally, use ``activate`` to activate the Ceph OSD Daemons. ::
+Once you have a basic cluster up and running, the next step is to expand the
+cluster. Add a Ceph OSD Daemon and a Ceph Metadata Server to ``ceph-node1``.
+Then add a Ceph Monitor to ``ceph-node2`` and ``ceph-node3`` to establish a
+quorum of Ceph Monitors.
- ceph-deploy osd activate {osd-node-name}:/tmp/osd0
- ceph-deploy osd activate {osd-node-name}:/tmp/osd1
+.. ditaa::
+ /------------------\ /----------------\
+ | ceph–deploy | | ceph–node1 |
+ | Admin Node | | cCCC |
+ | +-------->+ mon.ceph–node1 |
+ | | | osd.2 |
+ | | | mds.ceph–node1 |
+ \---------+--------/ \----------------/
+ |
+ | /----------------\
+ | | ceph–node2 |
+ | | cCCC |
+ +----------------->+ |
+ | | osd.0 |
+ | | mon.ceph–node2 |
+ | \----------------/
+ |
+ | /----------------\
+ | | ceph–node3 |
+ | | cCCC |
+ +----------------->+ |
+ | osd.1 |
+ | mon.ceph–node3 |
+ \----------------/
-.. tip:: You need two OSDs to reach an ``active + clean`` state. You can
- add one OSD at a time, but OSDs need to communicate with each other
- for Ceph to run properly. Always use more than one OSD per cluster.
+Adding an OSD
+-------------
+Since you are running a 3-node cluster for demonstration purposes, add the OSD
+to the monitor node. ::
-List Disks
-----------
+ ssh ceph-node1
+ sudo mkdir /tmp/osd2
+ exit
-To list the available disk drives on a prospective :term:`Ceph Node`, execute
-the following::
+Then, from your ``ceph-deploy`` node, prepare the OSD. ::
- ceph-deploy disk list {osd-node-name}
- ceph-deploy disk list ceph-node
+ ceph-deploy osd prepare {ceph-node}:/path/to/directory
+ ceph-deploy osd prepare ceph-node1:/tmp/osd2
+Finally, activate the OSDs. ::
-Zap a Disk
-----------
+ ceph-deploy osd activate {ceph-node}:/path/to/directory
+ ceph-deploy osd activate ceph-node1:/tmp/osd2
-To zap a disk (delete its partition table) in preparation for use with Ceph,
-execute the following::
- ceph-deploy disk zap {osd-node-name}:{disk}
- ceph-deploy disk zap ceph-node:sdb ceph-node:sdb2
+Once you have added your new OSD, Ceph will begin rebalancing the cluster by
+migrating placement groups to your new OSD. You can observe this process with
+the ``ceph`` CLI. ::
-.. important:: This will delete all data on the disk.
+ ceph -w
+You should see the placement group states change from ``active+clean`` to an
+active state with some degraded objects, and finally ``active+clean`` when
+migration completes. (Press Control-C to exit.)
-Add OSDs on Standalone Disks
-----------------------------
-You can add OSDs using ``prepare`` and ``activate`` in two discrete
-steps. To prepare a disk for use with a Ceph OSD Daemon, execute the
-following::
+Add a Metadata Server
+---------------------
- ceph-deploy osd prepare {osd-node-name}:{osd-disk-name}[:/path/to/journal]
- ceph-deploy osd prepare ceph-node:sdb
+To use CephFS, you need at least one metadata server. Execute the following to
+create a metadata server::
-To activate the Ceph OSD Daemon, execute the following::
+ ceph-deploy mds create {ceph-node}
+ ceph-deploy mds create ceph-node1
- ceph-deploy osd activate {osd-node-name}:{osd-partition-name}
- ceph-deploy osd activate ceph-node:sdb1
-To prepare an OSD disk and activate it in one step, execute the following::
+.. note:: Currently Ceph runs in production with one metadata server only. You
+ may use more, but there is currently no commercial support for a cluster
+ with multiple metadata servers.
- ceph-deploy osd create {osd-node-name}:{osd-disk-name}[:/path/to/journal] [{osd-node-name}:{osd-disk-name}[:/path/to/journal]]
- ceph-deploy osd create ceph-node:sdb:/dev/ssd1 ceph-node:sdc:/dev/ssd2
+Adding Monitors
+---------------
-.. note:: The journal example assumes you will use a partition on a separate
- solid state drive (SSD). If you omit a journal drive or partition,
- ``ceph-deploy`` will use create a separate partition for the journal
- on the same drive. If you have already formatted your disks and created
- partitions, you may also use partition syntax for your OSD disk.
+A Ceph Storage Cluster requires at least one Ceph Monitor to run. For high
+availability, Ceph Storage Clusters typically run multiple Ceph
+Monitors so that the failure of a single Ceph Monitor will not bring down the
+Ceph Storage Cluster. Ceph uses the Paxos algorithm, which requires a majority
+of monitors (i.e., 1, 2:3, 3:4, 3:5, 4:6, etc.) to form a quorum.
-You must add a minimum of two Ceph OSD Daemons for the placement groups in
-a cluster to achieve an ``active + clean`` state.
+Add two Ceph Monitors to your cluster. ::
+ ceph-deploy mon create {ceph-node}
+ ceph-deploy mon create ceph-node2 ceph-node3
-Add a MDS
-=========
+Once you have added your new Ceph Monitors, Ceph will begin synchronizing
+the monitors and form a quorum. You can check the quorum status by executing
+the following::
-To use CephFS, you need at least one metadata node. Execute the following to
-create a metadata node::
+ ceph quorum_status
- ceph-deploy mds create {node-name}
- ceph-deploy mds create ceph-node
-.. note:: Currently Ceph runs in production with one metadata node only. You
- may use more, but there is currently no commercial support for a cluster
- with multiple metadata nodes.
+Storing/Retrieving Object Data
+==============================
+To store object data in the Ceph Storage Cluster, a Ceph client must:
-Summary
-=======
+#. Set an object name
+#. Specify a `pool`_
-Deploying a Ceph cluster with ``ceph-deploy`` automatically starts the cluster.
-To operate the cluster daemons, see `Running Ceph with Upstart`_.
+The Ceph Client retrieves the latest cluster map and the CRUSH algorithm
+calculates how to map the object to a `placement group`_, and then calculates
+how to assign the placement group to a Ceph OSD Daemon dynamically. To find the
+object location, all you need is the object name and the pool name. For
+example::
-Once you deploy a Ceph cluster, you can try out some of the administration
-functionality, the object store command line, and then proceed to Quick Start
-guides for RBD, CephFS, and the Ceph Gateway.
+ ceph osd map {poolname} {object-name}
-.. topic:: Other ceph-deploy Commands
+.. topic:: Exercise: Locate an Object
- To view other ``ceph-deploy`` commands, execute:
-
- ``ceph-deploy -h``
-
+ As an exercise, let's create an object. Specify an object name, a path to
+ a test file containing some object data and a pool name using the
+ ``rados put`` command on the command line. For example::
+
+ rados put {object-name} {file-path} --pool=data
+ rados put test-object-1 testfile.txt --pool=data
+
+ To verify that the Ceph Storage Cluster stored the object, execute
+ the following::
+
+ rados -p data ls
+
+ Now, identify the object location::
-See `Ceph Deploy`_ for additional details.
+ ceph osd map {pool-name} {object-name}
+ ceph osd map data test-object-1
+
+ Ceph should output the object's location. For example::
+
+ osdmap e537 pool 'data' (0) object 'test-object-1' -> pg 0.d1743484 (0.4) -> up [1,0] acting [1,0]
+
+ To remove the test object, simply delete it using the ``rados rm``
+ command. For example::
+
+ rados rm test-object-1 --pool=data
+
+As the cluster evolves, the object location may change dynamically. One benefit
+of Ceph's dynamic rebalancing is that Ceph relieves you from having to perform
+the migration manually.
.. _Preflight Checklist: ../quick-start-preflight
.. _Ceph Deploy: ../../rados/deployment
.. _ceph-deploy install -h: ../../rados/deployment/ceph-deploy-install
.. _ceph-deploy new -h: ../../rados/deployment/ceph-deploy-new
+.. _ceph-deploy osd: ../../rados/deployment/ceph-deploy-osd
.. _Running Ceph with Upstart: ../../rados/operations/operating#running-ceph-with-upstart
-.. _CRUSH Map: ../../rados/operations/crush-map \ No newline at end of file
+.. _Running Ceph with sysvinit: ../../rados/operations/operating#running-ceph-with-sysvinit
+.. _CRUSH Map: ../../rados/operations/crush-map
+.. _pool: ../../rados/operations/pools
+.. _placement group: ../../rados/operations/placement-groups
+.. _Monitoring a Cluster: ../../rados/operations/monitoring
+.. _Monitoring OSDs and PGs: ../../rados/operations/monitoring-osd-pg \ No newline at end of file
diff --git a/doc/start/quick-cephfs.rst b/doc/start/quick-cephfs.rst
index 18dadb005ec..5449e5a6fe3 100644
--- a/doc/start/quick-cephfs.rst
+++ b/doc/start/quick-cephfs.rst
@@ -3,7 +3,7 @@
=====================
To use the :term:`Ceph FS` Quick Start guide, you must have executed the
-procedures in the `Ceph Deploy Quick Start`_ guide first. Execute this quick
+procedures in the `Storage Cluster Quick Start`_ guide first. Execute this quick
start on the Admin Host.
Prerequisites
@@ -91,7 +91,7 @@ See `Ceph FS`_ for additional information. Ceph FS is not quite as stable
as the Ceph Block Device and Ceph Object Storage. See `Troubleshooting`_
if you encounter trouble.
-.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _Ceph FS: ../../cephfs/
.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
.. _Troubleshooting: ../../cephfs/troubleshooting \ No newline at end of file
diff --git a/doc/start/quick-rbd.rst b/doc/start/quick-rbd.rst
index a466771502d..9424457f8c2 100644
--- a/doc/start/quick-rbd.rst
+++ b/doc/start/quick-rbd.rst
@@ -2,47 +2,73 @@
Block Device Quick Start
==========================
-To use this guide, you must have executed the procedures in the `Object Store
-Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
-``active + clean`` state before working with the :term:`Ceph Block Device`.
-Execute this quick start on the admin node.
+To use this guide, you must have executed the procedures in the `Storage
+Cluster Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is
+in an ``active + clean`` state before working with the :term:`Ceph Block
+Device`.
.. note:: The Ceph Block Device is also known as :term:`RBD` or :term:`RADOS`
Block Device.
-#. Install ``ceph-common``. ::
- sudo apt-get install ceph-common
+.. ditaa::
+ /------------------\ /----------------\
+ | Admin Node | | ceph–client |
+ | +-------->+ cCCC |
+ | ceph–deploy | | ceph |
+ \------------------/ \----------------/
-#. Create a block device image. ::
- rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
+You may use a virtual machine for your ``ceph-client`` node, but do not
+execute the following procedures on the same physical node as your Ceph
+Storage Cluster nodes (unless you use a VM). See `FAQ`_ for details.
-#. Load the ``rbd`` client module. ::
+
+Install Ceph
+============
+
+#. On the admin node, use ``ceph-deploy`` to install Ceph on your
+ ``ceph-client`` node. ::
+
+ ceph-deploy install ceph-client
+
+#. On the admin node, use ``ceph-deploy`` to copy the Ceph configuration file
+ and the ``ceph.client.admin.keyring`` to the ``ceph-client``. ::
+
+ ceph-deploy admin ceph-client
+
+
+Configure a Block Device
+========================
+
+#. On the ``ceph-client`` node, create a block device image. ::
+
+ rbd create foo --size 4096 [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
+
+#. On the ``ceph-client`` node, load the ``rbd`` client module. ::
sudo modprobe rbd
-#. Map the image to a block device. ::
+#. On the ``ceph-client`` node, map the image to a block device. ::
sudo rbd map foo --pool rbd --name client.admin [-m {mon-IP}] [-k /path/to/ceph.client.admin.keyring]
-#. Use the block device. In the following example, create a file system. ::
+#. Use the block device by creating a file system on the ``ceph-client``
+ node. ::
sudo mkfs.ext4 -m0 /dev/rbd/rbd/foo
This may take a few moments.
-#. Mount the file system. ::
+#. Mount the file system on the ``ceph-client`` node. ::
sudo mkdir /mnt/ceph-block-device
sudo mount /dev/rbd/rbd/foo /mnt/ceph-block-device
cd /mnt/ceph-block-device
-.. note:: Mount the block device on the client machine,
- not the server machine. See `FAQ`_ for details.
See `block devices`_ for additional details.
-.. _Object Store Quick Start: ../quick-ceph-deploy
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _block devices: ../../rbd/rbd
.. _FAQ: http://wiki.ceph.com/03FAQs/01General_FAQ#How_Can_I_Give_Ceph_a_Try.3F
diff --git a/doc/start/quick-rgw.rst b/doc/start/quick-rgw.rst
index af48a3154c1..40cf7d4f4dc 100644
--- a/doc/start/quick-rgw.rst
+++ b/doc/start/quick-rgw.rst
@@ -2,7 +2,7 @@
Object Storage Quick Start
============================
-To use this guide, you must have executed the procedures in the `Ceph Deploy
+To use this guide, you must have executed the procedures in the `Storage Cluster
Quick Start`_ guide first. Ensure your :term:`Ceph Storage Cluster` is in an
``active + clean`` state before working with the :term:`Ceph Object Storage`.
@@ -344,7 +344,7 @@ tutorials. See the `S3-compatible`_ and `Swift-compatible`_ APIs for details.
.. _Create rgw.conf: ../../radosgw/config/index.html#create-rgw-conf
-.. _Ceph Deploy Quick Start: ../quick-ceph-deploy
+.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _Ceph Object Storage Manual Install: ../../radosgw/manual-install
.. _RGW Configuration: ../../radosgw/config
.. _S3-compatible: ../../radosgw/s3
diff --git a/doc/start/quick-start-preflight.rst b/doc/start/quick-start-preflight.rst
index 74dc403c211..77a54795f19 100644
--- a/doc/start/quick-start-preflight.rst
+++ b/doc/start/quick-start-preflight.rst
@@ -4,74 +4,57 @@
.. versionadded:: 0.60
-Thank you for trying Ceph! Petabyte-scale data clusters are quite an
-undertaking. Before delving deeper into Ceph, we recommend setting up a two-node
-demo cluster to explore some of the functionality. This **Preflight Checklist**
-will help you prepare an admin node and a server node for use with
-``ceph-deploy``.
-
-.. ditaa::
- /----------------\ /----------------\
- | Admin Node |<------->| Server Node |
- | cCCC | | cCCC |
- \----------------/ \----------------/
-
-
-Before you can deploy Ceph using ``ceph-deploy``, you need to ensure that you
-have a few things set up first on your admin node and on nodes running Ceph
-daemons.
-
-
-Install an Operating System
-===========================
-
-Install a recent release of Debian or Ubuntu (e.g., 12.04, 12.10, 13.04) on your
-nodes. For additional details on operating systems or to use other operating
-systems other than Debian or Ubuntu, see `OS Recommendations`_.
-
-
-Install an SSH Server
-=====================
-
-The ``ceph-deploy`` utility requires ``ssh``, so your server node(s) require an
-SSH server. ::
-
- sudo apt-get install openssh-server
-
-
-Create a User
-=============
-
-Create a user on nodes running Ceph daemons.
-
-.. tip:: We recommend a username that brute force attackers won't
- guess easily (e.g., something other than ``root``, ``ceph``, etc).
-
-::
+Thank you for trying Ceph! We recommend setting up a ``ceph-deploy`` admin node
+and a 3-node :term:`Ceph Storage Cluster` to explore the basics of Ceph. This
+**Preflight Checklist** will help you prepare a ``ceph-deploy`` admin node and
+three Ceph Nodes (or virtual machines) that will host your Ceph Storage Cluster.
+
+
+.. ditaa::
+ /------------------\ /----------------\
+ | Admin Node | | ceph–node1 |
+ | +-------->+ |
+ | ceph–deploy | | cCCC |
+ \---------+--------/ \----------------/
+ |
+ | /----------------\
+ | | ceph–node2 |
+ +----------------->+ |
+ | | cCCC |
+ | \----------------/
+ |
+ | /----------------\
+ | | ceph–node3 |
+ +----------------->| |
+ | cCCC |
+ \----------------/
+
+
+Ceph Node Setup
+===============
+
+Perform the following steps:
+
+#. Create a user on each Ceph Node. ::
ssh user@ceph-server
sudo useradd -d /home/ceph -m ceph
sudo passwd ceph
-
-``ceph-deploy`` installs packages onto your nodes. This means that
-the user you create requires passwordless ``sudo`` privileges.
-
-.. note:: We **DO NOT** recommend enabling the ``root`` password
- for security reasons.
-
-To provide full privileges to the user, add the following to
-``/etc/sudoers.d/ceph``. ::
+#. Add ``root`` privileges for the user on each Ceph Node. ::
echo "ceph ALL = (root) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/ceph
sudo chmod 0440 /etc/sudoers.d/ceph
-Configure SSH
-=============
+#. Install an SSH server (if necessary)::
-Configure your admin machine with password-less SSH access to each node
-running Ceph daemons (leave the passphrase empty). ::
+ sudo apt-get install openssh-server
+ sudo yum install openssh-server
+
+
+#. Configure your ``ceph-deploy`` admin node with password-less SSH access to
+ each Ceph Node. Leave the passphrase empty::
ssh-keygen
Generating public/private key pair.
@@ -81,77 +64,95 @@ running Ceph daemons (leave the passphrase empty). ::
Your identification has been saved in /ceph-client/.ssh/id_rsa.
Your public key has been saved in /ceph-client/.ssh/id_rsa.pub.
-Copy the key to each node running Ceph daemons::
+#. Copy the key to each Ceph Node. ::
ssh-copy-id ceph@ceph-server
-Modify your ~/.ssh/config file of your admin node so that it defaults
-to logging in as the user you created when no username is specified. ::
+
+#. Modify the ``~/.ssh/config`` file of your ``ceph-deploy`` admin node so that
+ it logs in to Ceph Nodes as the user you created (e.g., ``ceph``). ::
Host ceph-server
- Hostname ceph-server.fqdn-or-ip-address.com
- User ceph
+ Hostname ceph-server.fqdn-or-ip-address.com
+ User ceph
+
+
+#. Ensure connectivity using ``ping`` with hostnames (i.e., not IP addresses).
+ Address hostname resolution issues and firewall issues as necessary.
-.. note:: Do not call ceph-deploy with ``sudo`` or run as ``root`` if you are
- login in as a different user (as in the ssh config above) because it
- will not issue ``sudo`` commands needed on the remote host.
-Install ceph-deploy
-===================
+Ceph Deploy Setup
+=================
-To install ``ceph-deploy``, execute the following::
+Add Ceph repositories to the ``ceph-deploy`` admin node. Then, install
+``ceph-deploy``.
+
+.. important:: Do not call ``ceph-deploy`` with ``sudo`` or run it as ``root``
+ if you are logged in as a different user, because it will not issue ``sudo``
+ commands needed on the remote host.
+
+
+Advanced Package Tool (APT)
+---------------------------
+
+For Debian and Ubuntu distributions, perform the following steps:
+
+#. Add the release key::
wget -q -O- 'https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc' | sudo apt-key add -
echo deb http://ceph.com/debian-dumpling/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
sudo apt-get update
sudo apt-get install ceph-deploy
+#. Add the Ceph packages to your repository. Replace ``{ceph-stable-release}``
+ with a stable Ceph release (e.g., ``cuttlefish``, ``dumpling``, etc.).
+ For example::
+
+ echo deb http://ceph.com/debian-{ceph-stable-release}/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
-Ensure Connectivity
-===================
+#. Update your repository and install ``ceph-deploy``::
-Ensure that your admin node has connectivity to the network and to your Server
-node (e.g., ensure ``iptables``, ``ufw`` or other tools that may prevent
-connections, traffic forwarding, etc. to allow what you need).
+ sudo apt-get update && sudo apt-get install ceph-deploy
-.. tip:: The ``ceph-deploy`` tool is new and you may encounter some issues
- without effective error messages.
-Once you have completed this pre-flight checklist, you are ready to begin using
-``ceph-deploy``.
+Red Hat Package Manager (RPM)
+-----------------------------
+For Red Hat (rhel6), CentOS (el6), Fedora 17-19 (f17-f19), OpenSUSE 12
+(opensuse12), and SLES (sles11), perform the following steps:
-Hostname Resolution
-===================
+#. Add the package to your repository. Open a text editor and create a
+ Yellowdog Updater, Modified (YUM) entry. Use the file path
+ ``/etc/yum.repos.d/ceph.repo``. For example::
-Ensure that your admin node can resolve the server node's hostname. ::
+ sudo vim /etc/yum.repos.d/ceph.repo
- ping {server-node}
+ Paste the following example code. Replace ``{ceph-stable-release}`` with
+ the most recent stable release of Ceph (e.g., ``dumpling``). Replace ``{distro}``
+ with your Linux distribution (e.g., ``el6`` for CentOS 6, ``rhel6`` for
+ Red Hat 6, ``fc18`` or ``fc19`` for Fedora 18 or Fedora 19, and ``sles11``
+ for SLES 11). Finally, save the contents to the
+ ``/etc/yum.repos.d/ceph.repo`` file. ::
-If you execute ``ceph-deploy`` against the localhost, ``ceph-deploy``
-must be able to resolve its IP address. Consider adding the IP address
-to your ``/etc/hosts`` file such that it resolves to the hostname. ::
+ [ceph-noarch]
+ name=Ceph noarch packages
+ baseurl=http://ceph.com/rpm-{ceph-stable-release}/{distro}/noarch
+ enabled=1
+ gpgcheck=1
+ type=rpm-md
+ gpgkey=https://ceph.com/git/?p=ceph.git;a=blob_plain;f=keys/release.asc
- hostname
- host -4 {hostname}
- sudo vim /etc/hosts
- {ip-address} {hostname}
+#. Update your repository and install ``ceph-deploy``::
- ceph-deploy {command} {hostname}
+ sudo yum update && sudo yum install ceph-deploy
-.. tip:: The ``ceph-deploy`` tool will not resolve to ``localhost``. Use
- the hostname.
Summary
=======
-Once you have passwordless ``ssh`` connectivity, passwordless ``sudo``,
-installed ``ceph-deploy``, and you have ensured appropriate connectivity,
-proceed to the `Storage Cluster Quick Start`_.
-
-.. tip:: The ``ceph-deploy`` utility can install Ceph packages on remote
- machines from the admin node!
+This completes the Quick Start Preflight. Proceed to the `Storage Cluster
+Quick Start`_.
.. _Storage Cluster Quick Start: ../quick-ceph-deploy
.. _OS Recommendations: ../../install/os-recommendations
diff --git a/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc
index eb7100a867f..7f2b8438f1f 100644
--- a/fusetrace/fusetrace_ll.cc
+++ b/fusetrace/fusetrace_ll.cc
@@ -11,7 +11,7 @@
gcc -Wall `pkg-config fuse --cflags --libs` -lulockmgr fusexmp_fh.c -o fusexmp_fh
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#ifdef HAVE_CONFIG_H
#include <config.h>
diff --git a/man/rbd.8 b/man/rbd.8
index 27a74aaa19a..88048674614 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -148,6 +148,11 @@ Specifies output formatting (default: plain, json, xml)
.B \-\-pretty\-format
Make json or xml formatted output more human\-readable.
.UNINDENT
+.INDENT 0.0
+.TP
+.B \-\-read\-only
+Set device readonly when mapping image.
+.UNINDENT
.SH COMMANDS
.INDENT 0.0
.TP
diff --git a/qa/run_xfstests.sh b/qa/run_xfstests.sh
index f3dffca293f..f9c3e55a79d 100644
--- a/qa/run_xfstests.sh
+++ b/qa/run_xfstests.sh
@@ -276,6 +276,9 @@ function install_xfstests() {
cd xfstests
+ # FIXME: use an older version before the tests were rearranged!
+ git reset --hard e5f1a13792f20cfac097fef98007610b422f2cac
+
ncpu=$(getconf _NPROCESSORS_ONLN 2>&1)
[ -n "${ncpu}" -a "${ncpu}" -gt 1 ] && multiple="-j ${ncpu}"
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 51420a2f134..09e55b9a842 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -169,7 +169,16 @@ bl=192.168.0.1:0/1000
ceph osd blacklist add $bl
ceph osd blacklist ls | grep $bl
ceph osd blacklist rm $bl
-expect_false "(ceph osd blacklist ls | grep $bl)"
+expect_false "ceph osd blacklist ls | grep $bl"
+
+bl=192.168.0.1
+# test without nonce, invalid nonce
+ceph osd blacklist add $bl
+ceph osd blacklist ls | grep $bl
+ceph osd blacklist rm $bl
+expect_false "ceph osd blacklist ls | grep $bl"
+expect_false "ceph osd blacklist $bl/-1"
+expect_false "ceph osd blacklist $bl/foo"
ceph osd crush tunables legacy
ceph osd crush tunables bobtail
diff --git a/qa/workunits/mon/crush_ops.sh b/qa/workunits/mon/crush_ops.sh
index 09e49acfbf6..f1770e171eb 100755
--- a/qa/workunits/mon/crush_ops.sh
+++ b/qa/workunits/mon/crush_ops.sh
@@ -68,4 +68,13 @@ ceph osd crush add-bucket foo host
ceph osd crush move foo root=default rack=localrack
ceph osd crush rm foo
+# test reweight
+o3=`ceph osd create`
+ceph osd crush add $o3 123 root=default
+ceph osd tree | grep osd.$o3 | grep 123
+ceph osd crush reweight osd.$o3 113
+ceph osd tree | grep osd.$o3 | grep 113
+ceph osd crush rm osd.$o3
+ceph osd rm osd.$o3
+
echo OK
diff --git a/qa/workunits/rados/test_tmap_to_omap.sh b/qa/workunits/rados/test_tmap_to_omap.sh
new file mode 100755
index 00000000000..76656ad726b
--- /dev/null
+++ b/qa/workunits/rados/test_tmap_to_omap.sh
@@ -0,0 +1,28 @@
+#!/bin/sh -ex
+
+expect_false()
+{
+ set -x
+ if "$@"; then return 1; else return 0; fi
+}
+
+pool="pool-$$"
+rados mkpool $pool
+
+rados -p $pool tmap set foo key1 value1
+rados -p $pool tmap set foo key2 value2
+rados -p $pool tmap set foo key2 value2
+rados -p $pool tmap dump foo | grep key1
+rados -p $pool tmap dump foo | grep key2
+rados -p $pool tmap-to-omap foo
+expect_false rados -p $pool tmap dump foo
+expect_false rados -p $pool tmap dump foo
+
+rados -p $pool listomapkeys foo | grep key1
+rados -p $pool listomapkeys foo | grep key2
+rados -p $pool getomapval foo key1 | grep value1
+rados -p $pool getomapval foo key2 | grep value2
+
+rados rmpool $pool $pool --yes-i-really-really-mean-it
+
+echo OK
diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh
index 8430fca7665..7abb3956c88 100755
--- a/qa/workunits/rbd/copy.sh
+++ b/qa/workunits/rbd/copy.sh
@@ -109,8 +109,8 @@ test_ls() {
rbd ls | grep test2
rbd ls | wc -l | grep 2
# look for fields in output of ls -l without worrying about space
- rbd ls -l | grep 'test1.*1024K.*1'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*1'
+ rbd ls -l | grep 'test2.*1024k.*1'
rbd rm test1
rbd rm test2
@@ -120,8 +120,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*2'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*2'
rbd rm test1
rbd rm test2
@@ -131,8 +131,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*1'
remove_images
# test that many images can be shown by ls
diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh
index 353a47fffbe..1813f7a9a88 100755
--- a/qa/workunits/rbd/import_export.sh
+++ b/qa/workunits/rbd/import_export.sh
@@ -66,7 +66,7 @@ dd if=/dev/urandom bs=1M count=1 of=/tmp/sparse2; truncate /tmp/sparse2 -s 2M
# 1M sparse, 1M data
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '2048K'
+rbd ls -l | grep sparse1 | grep '2048k'
[ "$(objects sparse1)" = '1' ]
# export, compare contents and on-disk size
@@ -77,7 +77,7 @@ rbd rm sparse1
# 1M data, 1M sparse
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '2048K'
+rbd ls -l | grep sparse2 | grep '2048k'
[ "$(objects sparse2)" = '0' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
@@ -88,7 +88,7 @@ rbd rm sparse2
truncate /tmp/sparse1 -s 10M
# import from stdin just for fun, verify still sparse
rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '10240K'
+rbd ls -l | grep sparse1 | grep '10240k'
[ "$(objects sparse1)" = '1' ]
rbd export sparse1 /tmp/sparse1.out
compare_files_and_ondisk_sizes /tmp/sparse1 /tmp/sparse1.out
@@ -99,7 +99,7 @@ rbd rm sparse1
dd if=/dev/urandom bs=2M count=1 of=/tmp/sparse2 oflag=append conv=notrunc
# again from stding
rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '4096K'
+rbd ls -l | grep sparse2 | grep '4096k'
[ "$(objects sparse2)" = '0 2 3' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
diff --git a/qa/workunits/snaps/snap-rm-diff.sh b/qa/workunits/snaps/snap-rm-diff.sh
index 8dff54f58b8..3d30dc7937a 100755
--- a/qa/workunits/snaps/snap-rm-diff.sh
+++ b/qa/workunits/snaps/snap-rm-diff.sh
@@ -1,5 +1,6 @@
#!/bin/sh -ex
+ceph mds set allow_new_snaps --yes-i-really-mean-it
wget -q http://ceph.com/qa/linux-2.6.33.tar.bz2
mkdir foo
cp linux* foo
diff --git a/qa/workunits/snaps/snaptest-0.sh b/qa/workunits/snaps/snaptest-0.sh
index 93e747af7dd..366249e7d25 100755
--- a/qa/workunits/snaps/snaptest-0.sh
+++ b/qa/workunits/snaps/snaptest-0.sh
@@ -1,7 +1,16 @@
#!/bin/sh -x
+expect_failure() {
+ if "$@"; then
+ return 1
+ fi
+ return 0
+}
set -e
+expect_failure mkdir .snap/foo
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo asdf > foo
mkdir .snap/foo
grep asdf .snap/foo/foo
@@ -14,4 +23,7 @@ grep asdf .snap/bar/bar
rmdir .snap/bar
rm foo
+ceph mds unset allow_new_snaps --yes-i-really-mean-it
+expect_failure mkdir .snap/baz
+
echo OK \ No newline at end of file
diff --git a/qa/workunits/snaps/snaptest-1.sh b/qa/workunits/snaps/snaptest-1.sh
index 59d41ef688f..7c528dd432a 100755
--- a/qa/workunits/snaps/snaptest-1.sh
+++ b/qa/workunits/snaps/snaptest-1.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo 1 > file1
echo 2 > file2
echo 3 > file3
diff --git a/qa/workunits/snaps/snaptest-2.sh b/qa/workunits/snaps/snaptest-2.sh
index 4b67999921c..b73bf9cb97f 100755
--- a/qa/workunits/snaps/snaptest-2.sh
+++ b/qa/workunits/snaps/snaptest-2.sh
@@ -1,5 +1,7 @@
#!/bin/bash
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "Create dir 100 to 199 ..."
for i in $(seq 100 199); do
echo " create dir $i"
diff --git a/qa/workunits/snaps/snaptest-authwb.sh b/qa/workunits/snaps/snaptest-authwb.sh
index 128efb70d19..acbb599bda9 100755
--- a/qa/workunits/snaps/snaptest-authwb.sh
+++ b/qa/workunits/snaps/snaptest-authwb.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
touch foo
chmod +x foo
mkdir .snap/s
diff --git a/qa/workunits/snaps/snaptest-capwb.sh b/qa/workunits/snaps/snaptest-capwb.sh
index 8c5a1333b69..9d0568cb6db 100755
--- a/qa/workunits/snaps/snaptest-capwb.sh
+++ b/qa/workunits/snaps/snaptest-capwb.sh
@@ -4,6 +4,8 @@ set -e
mkdir foo
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# make sure mds handles it when the client does not send flushsnap
echo x > foo/x
sync
diff --git a/qa/workunits/snaps/snaptest-dir-rename.sh b/qa/workunits/snaps/snaptest-dir-rename.sh
index e81edf9c47f..6995f537a47 100755
--- a/qa/workunits/snaps/snaptest-dir-rename.sh
+++ b/qa/workunits/snaps/snaptest-dir-rename.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
#
# make sure we keep an existing dn's seq
#
diff --git a/qa/workunits/snaps/snaptest-double-null.sh b/qa/workunits/snaps/snaptest-double-null.sh
index cdf32e4f0ef..5a673ff9c0d 100755
--- a/qa/workunits/snaps/snaptest-double-null.sh
+++ b/qa/workunits/snaps/snaptest-double-null.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# multiple intervening snapshots with no modifications, and thus no
# snapflush client_caps messages. make sure the mds can handle this.
diff --git a/qa/workunits/snaps/snaptest-estale.sh b/qa/workunits/snaps/snaptest-estale.sh
index a4fb94368d4..31ba5a87659 100755
--- a/qa/workunits/snaps/snaptest-estale.sh
+++ b/qa/workunits/snaps/snaptest-estale.sh
@@ -1,5 +1,7 @@
#!/bin/sh -x
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
mkdir .snap/foo
echo "We want ENOENT, not ESTALE, here."
diff --git a/qa/workunits/snaps/snaptest-git-ceph.sh b/qa/workunits/snaps/snaptest-git-ceph.sh
index 11532d8b14b..71a71e1d469 100755
--- a/qa/workunits/snaps/snaptest-git-ceph.sh
+++ b/qa/workunits/snaps/snaptest-git-ceph.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
git clone git://ceph.com/git/ceph.git
cd ceph
diff --git a/qa/workunits/snaps/snaptest-intodir.sh b/qa/workunits/snaps/snaptest-intodir.sh
index 3cbbe01718e..d022cfd479e 100755
--- a/qa/workunits/snaps/snaptest-intodir.sh
+++ b/qa/workunits/snaps/snaptest-intodir.sh
@@ -1,5 +1,7 @@
#!/bin/sh -ex
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
# this tests fix for #1399
mkdir foo
mkdir foo/.snap/one
diff --git a/qa/workunits/snaps/snaptest-multiple-capsnaps.sh b/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
index 5ebc852cf6c..d88722bde09 100755
--- a/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
+++ b/qa/workunits/snaps/snaptest-multiple-capsnaps.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo asdf > a
mkdir .snap/1
chmod 777 a
diff --git a/qa/workunits/snaps/snaptest-parents.sh b/qa/workunits/snaps/snaptest-parents.sh
index 7e5241a27c0..8963f628dc8 100644
--- a/qa/workunits/snaps/snaptest-parents.sh
+++ b/qa/workunits/snaps/snaptest-parents.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "making directory tree and files"
mkdir -p 1/a/b/c/
echo "i'm file1" > 1/a/file1
diff --git a/qa/workunits/snaps/snaptest-snap-rm-cmp.sh b/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
index aa094e70789..68ecf37b73e 100755
--- a/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
+++ b/qa/workunits/snaps/snaptest-snap-rm-cmp.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
file=linux-2.6.33.tar.bz2
wget -q http://ceph.com/qa/$file
diff --git a/qa/workunits/snaps/snaptest-upchildrealms.sh b/qa/workunits/snaps/snaptest-upchildrealms.sh
index 63b7167b42d..b5b8830e9f0 100755
--- a/qa/workunits/snaps/snaptest-upchildrealms.sh
+++ b/qa/workunits/snaps/snaptest-upchildrealms.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
#
# verify that a snap update on a parent realm will induce
# snap cap writeback for inodes child realms
diff --git a/qa/workunits/snaps/snaptest-xattrwb.sh b/qa/workunits/snaps/snaptest-xattrwb.sh
index b2dd7bc748a..c36e2575845 100755
--- a/qa/workunits/snaps/snaptest-xattrwb.sh
+++ b/qa/workunits/snaps/snaptest-xattrwb.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
echo "testing simple xattr wb"
touch x
setfattr -n user.foo x
diff --git a/qa/workunits/snaps/untar_snap_rm.sh b/qa/workunits/snaps/untar_snap_rm.sh
index 5c71212df75..89e2db0cd10 100755
--- a/qa/workunits/snaps/untar_snap_rm.sh
+++ b/qa/workunits/snaps/untar_snap_rm.sh
@@ -2,6 +2,8 @@
set -e
+ceph mds set allow_new_snaps --yes-i-really-mean-it
+
do_tarball() {
wget http://ceph.com/qa/$1
tar xvf$2 $1
diff --git a/qa/workunits/suites/fsstress.sh b/qa/workunits/suites/fsstress.sh
index 7f945172687..394e5fad991 100755
--- a/qa/workunits/suites/fsstress.sh
+++ b/qa/workunits/suites/fsstress.sh
@@ -2,6 +2,7 @@
if [ ! -f /usr/lib/ltp/testcases/bin/fsstress ]
then
+ path=`pwd`
mkdir -p /tmp/fsstress
cd /tmp/fsstress
wget -q -O /tmp/fsstress/ltp-full.tgz http://ceph.com/qa/ltp-full-20091231.tgz
@@ -13,6 +14,7 @@ then
sudo cp -avf /tmp/fsstress/ltp-full-20091231/testcases/kernel/fs/fsstress/fsstress /usr/lib/ltp/testcases/bin/fsstress
sudo chmod 755 /usr/lib/ltp/testcases/bin/fsstress
rm -Rf /tmp/fsstress
+ cd $path
fi
command="/usr/lib/ltp/testcases/bin/fsstress -d fsstress-`hostname`$$ -l 1 -n 1000 -p 10 -v"
diff --git a/src/.gitignore b/src/.gitignore
index 4c98529bd87..6efe8dc6bc4 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -68,6 +68,7 @@ Makefile
/test_*
/cls_test_*
/unittest_*
+/get_command_descriptions
# old dir, may in use by older branches
/leveldb
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index 900998702f5..6a4e09512a2 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -8,6 +8,7 @@ CLEANFILES =
noinst_HEADERS =
bin_PROGRAMS =
+noinst_PROGRAMS =
bin_SCRIPTS =
sbin_PROGRAMS =
sbin_SCRIPTS =
@@ -26,6 +27,12 @@ ceph_sbindir = $(exec_prefix)$(sbindir)
# C/C++ tests to build will be appended to this
check_PROGRAMS =
+# tests scripts will be appended to this
+check_SCRIPTS =
+
+# python unit tests need to know where the scripts are located
+export PYTHONPATH=$(top_srcdir)/src/pybind
+
# when doing a debug build, make sure to make the targets
if WITH_DEBUG
bin_PROGRAMS += $(bin_DEBUGPROGRAMS)
diff --git a/src/Makefile.am b/src/Makefile.am
index ed07a91e3ae..280b268479e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -251,10 +251,11 @@ shell_scripts += init-ceph mkcephfs
# executables built, you need to replace this with manual assignments
# target by target
-TESTS = $(check_PROGRAMS) unittest_bufferlist.sh
+TESTS = \
+ $(check_PROGRAMS) \
+ $(check_SCRIPTS)
check-local:
- $(srcdir)/test/encoding/check-generated.sh
$(srcdir)/test/encoding/readable.sh ../ceph-object-corpus
@@ -294,12 +295,12 @@ CLEANFILES += ceph_ver.h sample.fetch_config
ceph: ceph.in ./ceph_ver.h Makefile
rm -f $@ $@.tmp
- echo "#!/usr/bin/python" >$@.tmp
+ echo "#!/usr/bin/env python" >$@.tmp
grep "#define CEPH_GIT_NICE_VER" ./ceph_ver.h | \
sed -e 's/#define \(.*VER\) /\1=/' >>$@.tmp
grep "#define CEPH_GIT_VER" ./ceph_ver.h | \
sed -e 's/#define \(.*VER\) /\1=/' -e 's/=\(.*\)$$/="\1"/' >>$@.tmp
- cat $@.in >>$@.tmp
+ cat $(srcdir)/$@.in >>$@.tmp
chmod a+x $@.tmp
chmod a-w $@.tmp
mv $@.tmp $@
diff --git a/src/arch/intel.c b/src/arch/intel.c
index 0513da53c23..8b2d2ccab12 100644
--- a/src/arch/intel.c
+++ b/src/arch/intel.c
@@ -4,8 +4,7 @@
int ceph_arch_intel_sse42 = 0;
-/* this probably isn't specific enough for x86_64? fix me someday */
-#ifdef __LP64__
+#ifdef __x86_64__
/* intel cpu? */
static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
@@ -35,7 +34,7 @@ int ceph_arch_intel_probe(void)
return 0;
}
-#else // __LP64__
+#else // __x86_64__
int ceph_arch_intel_probe(void)
{
@@ -43,4 +42,4 @@ int ceph_arch_intel_probe(void)
return 0;
}
-#endif // __LP64__
+#endif // __x86_64__
diff --git a/src/ceph-create-keys b/src/ceph-create-keys
index 176b06e7a38..0359228d5f8 100755
--- a/src/ceph-create-keys
+++ b/src/ceph-create-keys
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import argparse
import errno
import json
diff --git a/src/ceph-disk b/src/ceph-disk
index 3d09bdf7418..64d944d9db0 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import argparse
import errno
@@ -570,7 +570,7 @@ def get_fsid(cluster):
fsid = get_conf(cluster=cluster, variable='fsid')
if fsid is None:
raise Error('getting cluster uuid from configuration failed')
- return fsid
+ return fsid.lower()
def get_or_create_dmcrypt_key(
@@ -888,15 +888,12 @@ def prepare_journal_dev(
def prepare_journal_file(
- journal,
- journal_size):
+ journal):
if not os.path.exists(journal):
- LOG.debug('Creating journal file %s with size %dM', journal, journal_size)
+ LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
with file(journal, 'wb') as journal_file:
- journal_file.truncate(journal_size * 1048576)
-
- # FIXME: should we resize an existing journal file?
+ pass
LOG.debug('Journal is file %s', journal)
LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
@@ -921,13 +918,13 @@ def prepare_journal(
if not os.path.exists(journal):
if force_dev:
raise Error('Journal does not exist; not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
jmode = os.stat(journal).st_mode
if stat.S_ISREG(jmode):
if force_dev:
raise Error('Journal is not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
if stat.S_ISBLK(jmode):
if force_file:
@@ -1604,6 +1601,7 @@ def find_cluster_by_uuid(_uuid):
Find a cluster name by searching /etc/ceph/*.conf for a conf file
with the right uuid.
"""
+ _uuid = _uuid.lower()
no_fsid = []
if not os.path.exists('/etc/ceph'):
return None
@@ -1611,11 +1609,15 @@ def find_cluster_by_uuid(_uuid):
if not conf_file.endswith('.conf'):
continue
cluster = conf_file[:-5]
- fsid = get_conf(cluster, 'fsid')
- if fsid is None:
+ try:
+ fsid = get_fsid(cluster)
+ except Error as e:
+ if e.message != 'getting cluster uuid from configuration failed':
+ raise e
no_fsid.append(cluster)
- elif fsid == _uuid:
- return cluster
+ else:
+ if fsid == _uuid:
+ return cluster
# be tolerant of /etc/ceph/ceph.conf without an fsid defined.
if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
LOG.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway')
diff --git a/src/ceph-rest-api b/src/ceph-rest-api
index ae5245b4f76..772b3d20fcd 100755
--- a/src/ceph-rest-api
+++ b/src/ceph-rest-api
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# vim: ts=4 sw=4 smarttab expandtab
import argparse
diff --git a/src/ceph.in b/src/ceph.in
index 320e4bd413f..075ec80c20b 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -476,6 +476,9 @@ def complete(sigdict, args, target):
###
def main():
+ ceph_args = os.environ.get('CEPH_ARGS')
+ if ceph_args:
+ sys.argv.extend(ceph_args.split())
parser, parsed_args, childargs = parse_cmdargs()
@@ -556,7 +559,6 @@ def main():
cluster_handle = rados.Rados(name=name, clustername=clustername,
conf_defaults=conf_defaults, conffile=conffile)
- cluster_handle.conf_parse_env()
retargs = cluster_handle.conf_parse_argv(childargs)
#tmp = childargs
childargs = retargs
@@ -642,7 +644,7 @@ def main():
if parsed_args.output_file:
try:
outf = open(parsed_args.output_file, 'w')
- except:
+ except Exception as e:
print >> sys.stderr, \
'Can\'t open output file {0}: {1}'.\
format(parsed_args.output_file, e)
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 3d517da1f89..2388762f1df 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -423,7 +423,7 @@ int main(int argc, const char **argv)
global_init_daemonize(g_ceph_context, 0);
common_init_finish(g_ceph_context);
- if (g_conf->filestore_update_to >= (int)FileStore::on_disk_version) {
+ if (g_conf->filestore_update_to >= (int)FileStore::target_version) {
int err = OSD::convertfs(g_conf->osd_data, g_conf->osd_journal);
if (err < 0) {
derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 77fd2084cf1..60a5e4550b8 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -148,9 +148,12 @@ Client::Client(Messenger *m, MonClient *mc)
timer(m->cct, client_lock),
ino_invalidate_cb(NULL),
ino_invalidate_cb_handle(NULL),
+ dentry_invalidate_cb(NULL),
+ dentry_invalidate_cb_handle(NULL),
getgroups_cb(NULL),
getgroups_cb_handle(NULL),
async_ino_invalidator(m->cct),
+ async_dentry_invalidator(m->cct),
tick_event(NULL),
monclient(mc), messenger(m), whoami(m->get_myname().num()),
initialized(false), mounted(false), unmounting(false),
@@ -410,11 +413,17 @@ void Client::shutdown()
admin_socket->unregister_command("dump_cache");
if (ino_invalidate_cb) {
- ldout(cct, 10) << "shutdown stopping invalidator finisher" << dendl;
+ ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
async_ino_invalidator.wait_for_empty();
async_ino_invalidator.stop();
}
+ if (dentry_invalidate_cb) {
+ ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
+ async_dentry_invalidator.wait_for_empty();
+ async_dentry_invalidator.stop();
+ }
+
objectcacher->stop(); // outside of client_lock! this does a join.
client_lock.Lock();
@@ -1532,7 +1541,7 @@ void Client::_closed_mds_session(MetaSession *s)
signal_context_list(s->waiting_for_open);
mount_cond.Signal();
remove_session_caps(s);
- kick_requests(s, true);
+ kick_requests_closed(s);
mds_sessions.erase(s->mds_num);
delete s;
}
@@ -1905,7 +1914,7 @@ void Client::handle_mds_map(MMDSMap* m)
if (newstate >= MDSMap::STATE_ACTIVE) {
if (oldstate < MDSMap::STATE_ACTIVE) {
- kick_requests(p->second, false);
+ kick_requests(p->second);
kick_flushing_caps(p->second);
signal_context_list(p->second->waiting_for_open);
kick_maxsize_requests(p->second);
@@ -1989,25 +1998,16 @@ void Client::send_reconnect(MetaSession *session)
}
-void Client::kick_requests(MetaSession *session, bool signal)
+void Client::kick_requests(MetaSession *session)
{
ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
-
for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
p != mds_requests.end();
- ++p)
+ ++p) {
if (p->second->mds == session->mds_num) {
- if (signal) {
- // only signal caller if there is a caller
- // otherwise, let resend_unsafe handle it
- if (p->second->caller_cond) {
- p->second->kick = true;
- p->second->caller_cond->Signal();
- }
- } else {
- send_request(p->second, session);
- }
+ send_request(p->second, session);
}
+ }
}
void Client::resend_unsafe_requests(MetaSession *session)
@@ -2018,6 +2018,25 @@ void Client::resend_unsafe_requests(MetaSession *session)
send_request(*iter, session);
}
+void Client::kick_requests_closed(MetaSession *session)
+{
+ ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
+ for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
+ p != mds_requests.end();
+ ++p) {
+ if (p->second->mds == session->mds_num) {
+ if (p->second->caller_cond) {
+ p->second->kick = true;
+ p->second->caller_cond->Signal();
+ }
+ p->second->item.remove_myself();
+ p->second->unsafe_item.remove_myself();
+ }
+ }
+ assert(session->requests.empty());
+ assert(session->unsafe_requests.empty());
+}
+
@@ -3551,6 +3570,45 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa
m->put();
}
+class C_Client_DentryInvalidate : public Context {
+private:
+ Client *client;
+ vinodeno_t dirino;
+ vinodeno_t ino;
+ string name;
+public:
+ C_Client_DentryInvalidate(Client *c, Dentry *dn) :
+ client(c), dirino(dn->dir->parent_inode->vino()),
+ ino(dn->inode->vino()), name(dn->name) { }
+ void finish(int r) {
+ client->_async_dentry_invalidate(dirino, ino, name);
+ }
+};
+
+void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
+{
+ ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
+ << " in dir " << dirino << dendl;
+ dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name);
+}
+
+void Client::_schedule_invalidate_dentry_callback(Dentry *dn)
+{
+ if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
+ async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn));
+}
+
+void Client::_invalidate_inode_parents(Inode *in)
+{
+ set<Dentry*>::iterator q = in->dn_set.begin();
+ while (q != in->dn_set.end()) {
+ Dentry *dn = *q++;
+ // FIXME: we play lots of unlink/link tricks when handling MDS replies,
+ // so in->dn_set doesn't always reflect the state of kernel's dcache.
+ _schedule_invalidate_dentry_callback(dn);
+ unlink(dn, false);
+ }
+}
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
@@ -3578,8 +3636,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
in->uid = m->head.uid;
in->gid = m->head.gid;
}
+ bool deleted_inode = false;
if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
in->nlink = m->head.nlink;
+ if (in->nlink == 0 &&
+ (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = true;
}
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
m->xattrbl.length() &&
@@ -3633,6 +3695,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
if (new_caps)
signal_cond_list(in->waitfor_caps);
+ // may drop inode's last ref
+ if (deleted_inode)
+ _invalidate_inode_parents(in);
+
m->put();
}
@@ -6319,6 +6385,17 @@ void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handl
async_ino_invalidator.start();
}
+void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle)
+{
+ Mutex::Locker l(client_lock);
+ ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl;
+ if (cb == NULL)
+ return;
+ dentry_invalidate_cb = cb;
+ dentry_invalidate_cb_handle = handle;
+ async_dentry_invalidator.start();
+}
+
void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle)
{
Mutex::Locker l(client_lock);
diff --git a/src/client/Client.h b/src/client/Client.h
index c7c9cef0e0c..df59f235de4 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -120,6 +120,9 @@ struct MetaRequest;
typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len);
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name);
+
typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids);
// ========================================================
@@ -211,10 +214,14 @@ class Client : public Dispatcher {
client_ino_callback_t ino_invalidate_cb;
void *ino_invalidate_cb_handle;
+ client_dentry_callback_t dentry_invalidate_cb;
+ void *dentry_invalidate_cb_handle;
+
client_getgroups_callback_t getgroups_cb;
void *getgroups_cb_handle;
Finisher async_ino_invalidator;
+ Finisher async_dentry_invalidator;
Context *tick_event;
utime_t last_cap_renew;
@@ -270,7 +277,8 @@ public:
void connect_mds_targets(int mds);
void send_request(MetaRequest *request, MetaSession *session);
MClientRequest *build_client_request(MetaRequest *request);
- void kick_requests(MetaSession *session, bool signal);
+ void kick_requests(MetaSession *session);
+ void kick_requests_closed(MetaSession *session);
void handle_client_request_forward(MClientRequestForward *reply);
void handle_client_reply(MClientReply *reply);
@@ -357,6 +365,7 @@ protected:
friend class C_Client_PutInode; // calls put_inode()
friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb
+ friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb
//int get_cache_size() { return lru.lru_get_size(); }
//void set_cache_size(int m) { lru.lru_set_max(m); }
@@ -459,6 +468,10 @@ protected:
void finish_cap_snap(Inode *in, CapSnap *capsnap, int used);
void _flushed_cap_snap(Inode *in, snapid_t seq);
+ void _schedule_invalidate_dentry_callback(Dentry *dn);
+ void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
+ void _invalidate_inode_parents(Inode *in);
+
void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
void _invalidate_inode_cache(Inode *in, bool keep_caps);
void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps);
@@ -735,6 +748,8 @@ public:
void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle);
+ void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle);
+
void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle);
};
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 6bf5ea3d34f..88f727e454e 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -12,7 +12,7 @@
*
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include <fuse/fuse.h>
#include <fuse/fuse_lowlevel.h>
@@ -551,7 +551,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
}
#endif
-static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
+static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
{
#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
@@ -560,6 +560,19 @@ static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t le
#endif
}
+static void dentry_invalidate_cb(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name)
+{
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
+ fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid);
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
+ fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid);
+ fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length());
+#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
+ fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length());
+#endif
+}
+
static void do_init(void *data, fuse_conn_info *bar)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)data;
@@ -743,9 +756,10 @@ int CephFuse::Handle::init(int argc, const char *argv[])
client->ll_register_getgroups_cb(getgroups_cb, this);
*/
+ client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this);
if (client->cct->_conf->fuse_use_invalidate_cb)
- client->ll_register_ino_invalidate_cb(invalidate_cb, this);
+ client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this);
done:
fuse_opt_free_args(&args);
diff --git a/src/common/Cond.h b/src/common/Cond.h
index e6a13ae48bb..46fdf159112 100644
--- a/src/common/Cond.h
+++ b/src/common/Cond.h
@@ -32,8 +32,8 @@ class Cond {
Mutex *waiter_mutex;
// don't allow copying.
- void operator=(Cond &C) {}
- Cond( const Cond &C ) {}
+ void operator=(Cond &C);
+ Cond(const Cond &C);
public:
Cond() : waiter_mutex(NULL) {
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 4c027909b4d..9ec6c3e895b 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -4,6 +4,7 @@ libcommon_la_SOURCES = \
common/LogClient.cc \
common/LogEntry.cc \
common/PrebufferedStreambuf.cc \
+ common/SloppyCRCMap.cc \
common/BackTrace.cc \
common/perf_counters.cc \
common/Mutex.cc \
@@ -59,13 +60,17 @@ libcommon_la_SOURCES = \
common/pick_address.cc \
common/util.cc \
common/TextTable.cc \
- common/secret.c \
common/ceph_fs.cc \
common/ceph_hash.cc \
common/ceph_strings.cc \
common/ceph_frag.cc \
common/addr_parsing.c \
- common/hobject.cc
+ common/hobject.cc \
+ common/bloom_filter.cc
+
+if LINUX
+libcommon_la_SOURCES += common/secret.c
+endif
# these should go out of libcommon
libcommon_la_SOURCES += \
@@ -93,6 +98,7 @@ LIBCOMMON_DEPS += libcommon_crc.la
noinst_LTLIBRARIES += libcommon_crc.la
noinst_HEADERS += \
+ common/bloom_filter.hpp \
common/sctp_crc32.h \
common/crc32c_intel_baseline.h \
common/crc32c_intel_fast.h
@@ -117,6 +123,7 @@ noinst_HEADERS += \
common/LogClient.h \
common/LogEntry.h \
common/Preforker.h \
+ common/SloppyCRCMap.h \
common/WorkQueue.h \
common/PrioritizedQueue.h \
common/ceph_argparse.h \
diff --git a/src/common/Mutex.h b/src/common/Mutex.h
index 06e435d49df..e26a090703d 100644
--- a/src/common/Mutex.h
+++ b/src/common/Mutex.h
@@ -46,8 +46,8 @@ private:
PerfCounters *logger;
// don't allow copying.
- void operator=(Mutex &M) {}
- Mutex( const Mutex &M ) {}
+ void operator=(Mutex &M);
+ Mutex(const Mutex &M);
void _register() {
id = lockdep_register(name);
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
new file mode 100644
index 00000000000..7924ae6e8a7
--- /dev/null
+++ b/src/common/SloppyCRCMap.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+
+void SloppyCRCMap::write(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *out)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ if (out)
+ *out << "write invalidate " << (offset - o) << "\n";
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ bufferlist t;
+ t.substr_of(bl, pos - offset, block_size);
+ crc_map[pos] = t.crc32c(crc_iv);
+ if (out)
+ *out << "write set " << pos << " " << crc_map[pos] << "\n";
+ pos += block_size;
+ left -= block_size;
+ }
+ if (left > 0) {
+ crc_map.erase(pos);
+ if (out)
+ *out << "write invalidate " << pos << "\n";
+ }
+}
+
+int SloppyCRCMap::read(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *err)
+{
+ int errors = 0;
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ // FIXME: this could be more efficient if we avoid doing a find()
+ // on each iteration
+ std::map<uint64_t,uint32_t>::iterator p = crc_map.find(pos);
+ if (p != crc_map.end()) {
+ bufferlist t;
+ t.substr_of(bl, pos - offset, block_size);
+ uint32_t crc = t.crc32c(crc_iv);
+ if (p->second != crc) {
+ errors++;
+ if (err)
+ *err << "offset " << pos << " len " << block_size
+ << " has crc " << crc << " expected " << p->second << "\n";
+ }
+ }
+ pos += block_size;
+ left -= block_size;
+ }
+ return errors;
+}
+
+void SloppyCRCMap::truncate(uint64_t offset)
+{
+ offset -= offset % block_size;
+ std::map<uint64_t,uint32_t>::iterator p = crc_map.lower_bound(offset);
+ while (p != crc_map.end())
+ crc_map.erase(p++);
+}
+
+void SloppyCRCMap::zero(uint64_t offset, uint64_t len)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ pos += (block_size - o);
+ left -= (block_size - o);
+ }
+ while (left >= block_size) {
+ crc_map[pos] = zero_crc;
+ pos += block_size;
+ left -= block_size;
+ }
+ if (left > 0)
+ crc_map.erase(pos);
+}
+
+void SloppyCRCMap::clone_range(uint64_t offset, uint64_t len,
+ uint64_t srcoff, const SloppyCRCMap& src,
+ std::ostream *out)
+{
+ int64_t left = len;
+ uint64_t pos = offset;
+ uint64_t srcpos = srcoff;
+ unsigned o = offset % block_size;
+ if (o) {
+ crc_map.erase(offset - o);
+ pos += (block_size - o);
+ srcpos += (block_size - o);
+ left -= (block_size - o);
+ if (out)
+ *out << "clone_range invalidate " << (offset - o) << "\n";
+ }
+ while (left >= block_size) {
+ // FIXME: this could be more efficient.
+ if (block_size == src.block_size) {
+ map<uint64_t,uint32_t>::const_iterator p = src.crc_map.find(srcpos);
+ if (p != src.crc_map.end()) {
+ crc_map[pos] = p->second;
+ if (out)
+ *out << "clone_range copy " << pos << " " << p->second << "\n";
+ } else {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+ } else {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+ pos += block_size;
+ srcpos += block_size;
+ left -= block_size;
+ }
+ if (left > 0) {
+ crc_map.erase(pos);
+ if (out)
+ *out << "clone_range invalidate " << pos << "\n";
+ }
+}
+
+void SloppyCRCMap::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(block_size, bl);
+ ::encode(crc_map, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SloppyCRCMap::decode(bufferlist::iterator& bl)
+{
+ DECODE_START(1, bl);
+ uint32_t bs;
+ ::decode(bs, bl);
+ set_block_size(bs);
+ ::decode(crc_map, bl);
+ DECODE_FINISH(bl);
+}
+
+void SloppyCRCMap::dump(Formatter *f) const
+{
+ f->dump_unsigned("block_size", block_size);
+ f->open_array_section("crc_map");
+ for (map<uint64_t,uint32_t>::const_iterator p = crc_map.begin(); p != crc_map.end(); ++p) {
+ f->open_object_section("crc");
+ f->dump_unsigned("offset", p->first);
+ f->dump_unsigned("crc", p->second);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void SloppyCRCMap::generate_test_instances(list<SloppyCRCMap*>& ls)
+{
+ ls.push_back(new SloppyCRCMap);
+ ls.push_back(new SloppyCRCMap(2));
+ bufferlist bl;
+ bl.append("some data");
+ ls.back()->write(1, bl.length(), bl);
+ ls.back()->write(10, bl.length(), bl);
+ ls.back()->zero(4, 2);
+}
diff --git a/src/common/SloppyCRCMap.h b/src/common/SloppyCRCMap.h
new file mode 100644
index 00000000000..c07b4d9bb9d
--- /dev/null
+++ b/src/common/SloppyCRCMap.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SLOPPYCRCMAP_H
+#define CEPH_COMMON_SLOPPYCRCMAP_H
+
+#include "include/types.h"
+#include "include/encoding.h"
+
+#include <map>
+#include <ostream>
+
+/**
+ * SloppyCRCMap
+ *
+ * Opportunistically track CRCs on any reads or writes that cover full
+ * blocks. Verify read results when we have CRC data available for
+ * the given extent.
+ */
+class SloppyCRCMap {
+  static const uint32_t crc_iv = 0xffffffff;
+
+ std::map<uint64_t, uint32_t> crc_map; // offset -> crc(-1)
+ uint32_t block_size;
+ uint32_t zero_crc;
+
+public:
+ SloppyCRCMap(uint32_t b=0) {
+ set_block_size(b);
+ }
+
+ void set_block_size(uint32_t b) {
+ block_size = b;
+ //zero_crc = ceph_crc32c(0xffffffff, NULL, block_size);
+ if (b) {
+ bufferlist bl;
+ bufferptr bp(block_size);
+ bp.zero();
+ bl.append(bp);
+ zero_crc = bl.crc32c(crc_iv);
+ } else {
+ zero_crc = crc_iv;
+ }
+ }
+
+ /// update based on a write
+ void write(uint64_t offset, uint64_t len, const bufferlist& bl,
+ std::ostream *out = NULL);
+
+ /// update based on a truncate
+ void truncate(uint64_t offset);
+
+ /// update based on a zero/punch_hole
+ void zero(uint64_t offset, uint64_t len);
+
+  /// update based on a clone_range
+ void clone_range(uint64_t offset, uint64_t len, uint64_t srcoff, const SloppyCRCMap& src,
+ std::ostream *out = NULL);
+
+ /**
+ * validate a read result
+ *
+ * @param offset offset
+   * @param len length
+ * @param bl data read
+   * @param err optional ostream to describe errors in detail
+ * @returns error count, 0 for success
+ */
+ int read(uint64_t offset, uint64_t len, const bufferlist& bl, std::ostream *err);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<SloppyCRCMap*>& ls);
+};
+WRITE_CLASS_ENCODER(SloppyCRCMap)
+
+#endif
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
new file mode 100644
index 00000000000..d1dbc1e7135
--- /dev/null
+++ b/src/common/TrackedOp.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#include "TrackedOp.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <vector>
+#include "common/debug.h"
+#include "common/config.h"
+#include "msg/Message.h"
+#include "include/assert.h"
+
+#define dout_subsys ceph_subsys_optracker
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static ostream& _prefix(std::ostream* _dout)
+{
+ return *_dout << "-- op tracker -- ";
+}
+
+void OpHistory::on_shutdown()
+{
+ arrived.clear();
+ duration.clear();
+ shutdown = true;
+}
+
+void OpHistory::insert(utime_t now, TrackedOpRef op)
+{
+ if (shutdown)
+ return;
+ duration.insert(make_pair(op->get_duration(), op));
+ arrived.insert(make_pair(op->get_arrived(), op));
+ cleanup(now);
+}
+
+void OpHistory::cleanup(utime_t now)
+{
+ while (arrived.size() &&
+ (now - arrived.begin()->first >
+ (double)(history_duration))) {
+ duration.erase(make_pair(
+ arrived.begin()->second->get_duration(),
+ arrived.begin()->second));
+ arrived.erase(arrived.begin());
+ }
+
+ while (duration.size() > history_size) {
+ arrived.erase(make_pair(
+ duration.begin()->second->get_arrived(),
+ duration.begin()->second));
+ duration.erase(duration.begin());
+ }
+}
+
+void OpHistory::dump_ops(utime_t now, Formatter *f)
+{
+ cleanup(now);
+ f->open_object_section("OpHistory");
+ f->dump_int("num to keep", history_size);
+ f->dump_int("duration to keep", history_duration);
+ {
+ f->open_array_section("Ops");
+ for (set<pair<utime_t, TrackedOpRef> >::const_iterator i =
+ arrived.begin();
+ i != arrived.end();
+ ++i) {
+ f->open_object_section("Op");
+ i->second->dump(now, f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void OpTracker::dump_historic_ops(Formatter *f)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+ utime_t now = ceph_clock_now(cct);
+ history.dump_ops(now, f);
+}
+
+void OpTracker::dump_ops_in_flight(Formatter *f)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+ f->open_object_section("ops_in_flight"); // overall dump
+ f->dump_int("num_ops", ops_in_flight.size());
+ f->open_array_section("ops"); // list of TrackedOps
+ utime_t now = ceph_clock_now(cct);
+ for (xlist<TrackedOp*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) {
+ f->open_object_section("op");
+ (*p)->dump(now, f);
+ f->close_section(); // this TrackedOp
+ }
+ f->close_section(); // list of TrackedOps
+ f->close_section(); // overall dump
+}
+
+void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+ ops_in_flight.push_back(i);
+ ops_in_flight.back()->seq = seq++;
+}
+
+void OpTracker::unregister_inflight_op(TrackedOp *i)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+ assert(i->xitem.get_list() == &ops_in_flight);
+ utime_t now = ceph_clock_now(cct);
+ i->xitem.remove_myself();
+ i->request->clear_data();
+ history.insert(now, TrackedOpRef(i));
+}
+
+bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+ if (!ops_in_flight.size())
+ return false;
+
+ utime_t now = ceph_clock_now(cct);
+ utime_t too_old = now;
+ too_old -= complaint_time;
+
+ utime_t oldest_secs = now - ops_in_flight.front()->get_arrived();
+
+ dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
+ << "; oldest is " << oldest_secs
+ << " seconds old" << dendl;
+
+ if (oldest_secs < complaint_time)
+ return false;
+
+ xlist<TrackedOp*>::iterator i = ops_in_flight.begin();
+ warning_vector.reserve(log_threshold + 1);
+
+ int slow = 0; // total slow
+ int warned = 0; // total logged
+ while (!i.end() && (*i)->get_arrived() < too_old) {
+ slow++;
+
+ // exponential backoff of warning intervals
+ if (((*i)->get_arrived() +
+ (complaint_time * (*i)->warn_interval_multiplier)) < now) {
+ // will warn
+ if (warning_vector.empty())
+ warning_vector.push_back("");
+ warned++;
+ if (warned > log_threshold)
+ break;
+
+ utime_t age = now - (*i)->get_arrived();
+ stringstream ss;
+ ss << "slow request " << age << " seconds old, received at " << (*i)->get_arrived()
+ << ": " << *((*i)->request) << " currently "
+ << ((*i)->current.size() ? (*i)->current : (*i)->state_string());
+ warning_vector.push_back(ss.str());
+
+ // only those that have been shown will backoff
+ (*i)->warn_interval_multiplier *= 2;
+ }
+ ++i;
+ }
+
+ // only summarize if we warn about any. if everything has backed
+ // off, we will stay silent.
+ if (warned > 0) {
+ stringstream ss;
+ ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
+ << oldest_secs << " secs";
+ warning_vector[0] = ss.str();
+ }
+
+ return warning_vector.size();
+}
+
+void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+
+ h->clear();
+
+ utime_t now = ceph_clock_now(NULL);
+ unsigned bin = 30;
+ uint32_t lb = 1 << (bin-1); // lower bound for this bin
+ int count = 0;
+ for (xlist<TrackedOp*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) {
+ utime_t age = now - (*i)->get_arrived();
+ uint32_t ms = (long)(age * 1000.0);
+ if (ms >= lb) {
+ count++;
+ continue;
+ }
+ if (count)
+ h->set(bin, count);
+ while (lb > ms) {
+ bin--;
+ lb >>= 1;
+ }
+ count = 1;
+ }
+ if (count)
+ h->set(bin, count);
+}
+
+void OpTracker::mark_event(TrackedOp *op, const string &dest)
+{
+ utime_t now = ceph_clock_now(cct);
+ return _mark_event(op, dest, now);
+}
+
+void OpTracker::_mark_event(TrackedOp *op, const string &evt,
+ utime_t time)
+{
+ Mutex::Locker locker(ops_in_flight_lock);
+ dout(5) << //"reqid: " << op->get_reqid() <<
+ ", seq: " << op->seq
+ << ", time: " << time << ", event: " << evt
+ << ", request: " << *op->request << dendl;
+}
+
+void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) {
+ op->mark_event("done");
+ tracker->unregister_inflight_op(op);
+ // Do not delete op, unregister_inflight_op took control
+}
+
+void TrackedOp::mark_event(const string &event)
+{
+ utime_t now = ceph_clock_now(g_ceph_context);
+ {
+ Mutex::Locker l(lock);
+ events.push_back(make_pair(now, event));
+ }
+ tracker->mark_event(this, event);
+ _event_marked();
+}
+
+void TrackedOp::dump(utime_t now, Formatter *f) const
+{
+ Message *m = request;
+ stringstream name;
+ m->print(name);
+ f->dump_string("description", name.str().c_str()); // this TrackedOp
+ f->dump_stream("received_at") << get_arrived();
+ f->dump_float("age", now - get_arrived());
+ f->dump_float("duration", get_duration());
+ {
+ f->open_array_section("type_data");
+ _dump(now, f);
+ f->close_section();
+ }
+}
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 753331df7f3..44e03905759 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -17,15 +17,163 @@
#include <stdint.h>
#include <include/utime.h>
#include "common/Mutex.h"
+#include "include/histogram.h"
#include "include/xlist.h"
#include "msg/Message.h"
#include <tr1/memory>
+class TrackedOp;
+typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef;
+
+class OpTracker;
+class OpHistory {
+ set<pair<utime_t, TrackedOpRef> > arrived;
+ set<pair<double, TrackedOpRef> > duration;
+ void cleanup(utime_t now);
+ bool shutdown;
+ OpTracker *tracker;
+ uint32_t history_size;
+ uint32_t history_duration;
+
+public:
+ OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_),
+ history_size(0), history_duration(0) {}
+ ~OpHistory() {
+ assert(arrived.empty());
+ assert(duration.empty());
+ }
+ void insert(utime_t now, TrackedOpRef op);
+ void dump_ops(utime_t now, Formatter *f);
+ void on_shutdown();
+ void set_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+ history_size = new_size;
+ history_duration = new_duration;
+ }
+};
+
+class OpTracker {
+ class RemoveOnDelete {
+ OpTracker *tracker;
+ public:
+ RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {}
+ void operator()(TrackedOp *op);
+ };
+ friend class RemoveOnDelete;
+ friend class OpHistory;
+ uint64_t seq;
+ Mutex ops_in_flight_lock;
+ xlist<TrackedOp *> ops_in_flight;
+ OpHistory history;
+ float complaint_time;
+ int log_threshold;
+
+public:
+ CephContext *cct;
+ OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"),
+ history(this), complaint_time(0), log_threshold(0), cct(cct_) {}
+ void set_complaint_and_threshold(float time, int threshold) {
+ complaint_time = time;
+ log_threshold = threshold;
+ }
+ void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+ history.set_size_and_duration(new_size, new_duration);
+ }
+ void dump_ops_in_flight(Formatter *f);
+ void dump_historic_ops(Formatter *f);
+ void register_inflight_op(xlist<TrackedOp*>::item *i);
+ void unregister_inflight_op(TrackedOp *i);
+
+ void get_age_ms_histogram(pow2_hist_t *h);
+
+ /**
+ * Look for Ops which are too old, and insert warning
+ * strings for each Op that is too old.
+ *
+ * @param warning_strings A vector<string> reference which is filled
+ * with a warning string for each old Op.
+ * @return True if there are any Ops to warn on, false otherwise.
+ */
+ bool check_ops_in_flight(std::vector<string> &warning_strings);
+ void mark_event(TrackedOp *op, const string &evt);
+ void _mark_event(TrackedOp *op, const string &evt, utime_t now);
+
+ void on_shutdown() {
+ Mutex::Locker l(ops_in_flight_lock);
+ history.on_shutdown();
+ }
+ ~OpTracker() {
+ assert(ops_in_flight.empty());
+ }
+
+ template <typename T>
+ typename T::Ref create_request(Message *ref)
+ {
+ typename T::Ref retval(new T(ref, this),
+ RemoveOnDelete(this));
+
+ _mark_event(retval.get(), "header_read", ref->get_recv_stamp());
+ _mark_event(retval.get(), "throttled", ref->get_throttle_stamp());
+ _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp());
+ _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp());
+
+ retval->init_from_message();
+
+ return retval;
+ }
+};
+
class TrackedOp {
+private:
+ friend class OpHistory;
+ friend class OpTracker;
+ xlist<TrackedOp*>::item xitem;
+protected:
+ Message *request; /// the logical request we are tracking
+ OpTracker *tracker; /// the tracker we are associated with
+
+ list<pair<utime_t, string> > events; /// list of events and their times
+ Mutex lock; /// to protect the events list
+ string current; /// the current state the event is in
+ uint64_t seq; /// a unique value set by the OpTracker
+
+ uint32_t warn_interval_multiplier; // limits output of a given op warning
+
+ TrackedOp(Message *req, OpTracker *_tracker) :
+ xitem(this),
+ request(req),
+ tracker(_tracker),
+ lock("TrackedOp::lock"),
+ seq(0),
+ warn_interval_multiplier(1)
+ {
+ tracker->register_inflight_op(&xitem);
+ }
+
+ virtual void init_from_message() {}
+ /// output any type-specific data you want to get when dump() is called
+ virtual void _dump(utime_t now, Formatter *f) const {}
+ /// if you want something else to happen when events are marked, implement
+ virtual void _event_marked() {}
+
public:
- virtual void mark_event(const string &event) = 0;
- virtual ~TrackedOp() {}
+ virtual ~TrackedOp() { assert(request); request->put(); }
+
+ utime_t get_arrived() const {
+ return request->get_recv_stamp();
+ }
+ // This function maybe needs some work; assumes last event is completion time
+ double get_duration() const {
+ return events.size() ?
+ (events.rbegin()->first - get_arrived()) :
+ 0.0;
+ }
+ Message *get_req() const { return request; }
+
+ void mark_event(const string &event);
+ virtual const char *state_string() const {
+ return events.rbegin()->second.c_str();
+ }
+ void dump(utime_t now, Formatter *f) const;
};
-typedef std::tr1::shared_ptr<TrackedOp> TrackedOpRef;
#endif
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index b2742accdce..794b577a71d 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -390,6 +390,43 @@ public:
void drain(WorkQueue_* wq = 0);
};
+class GenContextWQ :
+ public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+ list<GenContext<ThreadPool::TPHandle&>*> _queue;
+public:
+ GenContextWQ(const string &name, time_t ti, ThreadPool *tp)
+ : ThreadPool::WorkQueueVal<
+ GenContext<ThreadPool::TPHandle&>*>(name, ti, ti*10, tp) {}
+
+ void _enqueue(GenContext<ThreadPool::TPHandle&> *c) {
+ _queue.push_back(c);
+ };
+ void _enqueue_front(GenContext<ThreadPool::TPHandle&> *c) {
+ _queue.push_front(c);
+ }
+ bool _empty() {
+ return _queue.empty();
+ }
+ GenContext<ThreadPool::TPHandle&> *_dequeue() {
+ assert(!_queue.empty());
+ GenContext<ThreadPool::TPHandle&> *c = _queue.front();
+ _queue.pop_front();
+ return c;
+ }
+ void _process(GenContext<ThreadPool::TPHandle&> *c, ThreadPool::TPHandle &tp) {
+ c->complete(tp);
+ }
+};
+class C_QueueInWQ : public Context {
+ GenContextWQ *wq;
+ GenContext<ThreadPool::TPHandle&> *c;
+public:
+ C_QueueInWQ(GenContextWQ *wq, GenContext<ThreadPool::TPHandle &> *c)
+ : wq(wq), c(c) {}
+ void finish(int) {
+ wq->queue(c);
+ }
+};
#endif
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 00000000000..68875e925bf
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "common/bloom_filter.hpp"
+
+void bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ ::encode((uint64_t)salt_count_, bl);
+ ::encode((uint64_t)insert_count_, bl);
+ ::encode((uint64_t)target_element_count_, bl);
+ ::encode((uint64_t)random_seed_, bl);
+ bufferptr bp((const char*)bit_table_, table_size_);
+ ::encode(bp, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bloom_filter::decode(bufferlist::iterator& p)
+{
+ DECODE_START(2, p);
+ uint64_t v;
+ ::decode(v, p);
+ salt_count_ = v;
+ ::decode(v, p);
+ insert_count_ = v;
+ ::decode(v, p);
+ target_element_count_ = v;
+ ::decode(v, p);
+ random_seed_ = v;
+ bufferlist t;
+ ::decode(t, p);
+
+ salt_.clear();
+ generate_unique_salt();
+ table_size_ = t.length();
+  delete[] bit_table_;
+ if (table_size_) {
+ bit_table_ = new cell_type[table_size_];
+ t.copy(0, table_size_, (char *)bit_table_);
+ } else {
+ bit_table_ = NULL;
+ }
+
+ DECODE_FINISH(p);
+}
+
+void bloom_filter::dump(Formatter *f) const
+{
+ f->dump_unsigned("salt_count", salt_count_);
+ f->dump_unsigned("table_size", table_size_);
+ f->dump_unsigned("insert_count", insert_count_);
+ f->dump_unsigned("target_element_count", target_element_count_);
+ f->dump_unsigned("random_seed", random_seed_);
+
+ f->open_array_section("salt_table");
+ for (std::vector<bloom_type>::const_iterator i = salt_.begin(); i != salt_.end(); ++i)
+ f->dump_unsigned("salt", *i);
+ f->close_section();
+
+ f->open_array_section("bit_table");
+ for (unsigned i = 0; i < table_size_; ++i)
+ f->dump_unsigned("byte", (unsigned)bit_table_[i]);
+ f->close_section();
+}
+
+void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
+{
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.push_back(new bloom_filter(50, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.back()->insert("baz");
+ ls.back()->insert("boof");
+ ls.back()->insert("boogggg");
+}
+
+
+void compressible_bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ bloom_filter::encode(bl);
+
+ uint32_t s = size_list.size();
+ ::encode(s, bl);
+ for (vector<size_t>::const_iterator p = size_list.begin();
+ p != size_list.end(); ++p)
+ ::encode((uint64_t)*p, bl);
+
+ ENCODE_FINISH(bl);
+}
+
+void compressible_bloom_filter::decode(bufferlist::iterator& p)
+{
+ DECODE_START(2, p);
+ bloom_filter::decode(p);
+
+ uint32_t s;
+ ::decode(s, p);
+ size_list.resize(s);
+ for (unsigned i = 0; i < s; i++) {
+ uint64_t v;
+ ::decode(v, p);
+ size_list[i] = v;
+ }
+
+ DECODE_FINISH(p);
+}
+
+void compressible_bloom_filter::dump(Formatter *f) const
+{
+ bloom_filter::dump(f);
+
+ f->open_array_section("table_sizes");
+ for (vector<size_t>::const_iterator p = size_list.begin();
+ p != size_list.end(); ++p)
+ f->dump_unsigned("size", (uint64_t)*p);
+ f->close_section();
+}
+
+void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls)
+{
+ ls.push_back(new compressible_bloom_filter(10, .5, 1));
+ ls.push_back(new compressible_bloom_filter(10, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.push_back(new compressible_bloom_filter(50, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.back()->insert("baz");
+ ls.back()->insert("boof");
+ ls.back()->compress(20);
+ ls.back()->insert("boogggg");
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 00000000000..93787a89a60
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,700 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ * *
+ * Open Bloom Filter *
+ * *
+ * Author: Arash Partow - 2000 *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ * *
+ * Copyright notice: *
+ * Free use of the Open Bloom Filter Library is permitted under *
+ * the guidelines and in accordance with the most current version *
+ * of the Boost Software License, Version 1.0 *
+ * http://www.opensource.org/licenses/bsl1.0.html *
+ * *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cstddef>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
static const std::size_t bits_per_char = 0x08;    // 8 bits in 1 char(unsigned)
// bit_mask[i] selects bit i (LSB-first) within one byte of the bit table.
static const unsigned char bit_mask[bits_per_char] = {
  0x01,  //00000001
  0x02,  //00000010
  0x04,  //00000100
  0x08,  //00001000
  0x10,  //00010000
  0x20,  //00100000
  0x40,  //01000000
  0x80   //10000000
};
+
+
+class bloom_filter
+{
+protected:
+
+ typedef unsigned int bloom_type;
+ typedef unsigned char cell_type;
+
+ unsigned char* bit_table_; ///< pointer to bit map
+ std::vector<bloom_type> salt_; ///< vector of salts
+ std::size_t salt_count_; ///< number of salts
+ std::size_t table_size_; ///< bit table size in bytes
+ std::size_t insert_count_; ///< insertion count
+ std::size_t target_element_count_; ///< target number of unique insertions
+ std::size_t random_seed_; ///< random seed
+
+public:
+
+ bloom_filter()
+ : bit_table_(0),
+ salt_count_(0),
+ table_size_(0),
+ insert_count_(0),
+ target_element_count_(0),
+ random_seed_(0)
+ {}
+
+ bloom_filter(const std::size_t& predicted_inserted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ insert_count_(0),
+ target_element_count_(predicted_inserted_element_count),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
+ &salt_count_, &table_size_);
+ init();
+ }
+
+ bloom_filter(const std::size_t& salt_count,
+ std::size_t table_size,
+ const std::size_t& random_seed,
+ std::size_t target_element_count)
+ : bit_table_(0),
+ salt_count_(salt_count),
+ table_size_(table_size),
+ insert_count_(0),
+ target_element_count_(target_element_count),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ init();
+ }
+
+ void init() {
+ generate_unique_salt();
+ if (table_size_) {
+ bit_table_ = new cell_type[table_size_];
+ std::fill_n(bit_table_, table_size_, 0x00);
+ } else {
+ bit_table_ = NULL;
+ }
+ }
+
+ bloom_filter(const bloom_filter& filter)
+ {
+ this->operator=(filter);
+ }
+
+ bloom_filter& operator = (const bloom_filter& filter)
+ {
+ if (this != &filter) {
+ salt_count_ = filter.salt_count_;
+ table_size_ = filter.table_size_;
+ insert_count_ = filter.insert_count_;
+ random_seed_ = filter.random_seed_;
+ delete[] bit_table_;
+ bit_table_ = new cell_type[table_size_];
+ std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_);
+ salt_ = filter.salt_;
+ }
+ return *this;
+ }
+
+ virtual ~bloom_filter()
+ {
+ delete[] bit_table_;
+ }
+
+ inline bool operator!() const
+ {
+ return (0 == table_size_);
+ }
+
+ inline void clear()
+ {
+ if (bit_table_)
+ std::fill_n(bit_table_, table_size_, 0x00);
+ insert_count_ = 0;
+ }
+
+ /**
+ * insert a u32 into the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to insert
+ */
+ inline void insert(uint32_t val) {
+ assert(bit_table_);
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ bit_table_[bit_index >> 3] |= bit_mask[bit];
+ }
+ ++insert_count_;
+ }
+
+ inline void insert(const unsigned char* key_begin, const std::size_t& length)
+ {
+ assert(bit_table_);
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ bit_table_[bit_index >> 3] |= bit_mask[bit];
+ }
+ ++insert_count_;
+ }
+
+ template<typename T>
+ inline void insert(const T& t)
+ {
+ // Note: T must be a C++ POD type.
+ insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
+ }
+
+ inline void insert(const std::string& key)
+ {
+ insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline void insert(const char* data, const std::size_t& length)
+ {
+ insert(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline void insert(const InputIterator begin, const InputIterator end)
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ insert(*(itr++));
+ }
+ }
+
+ /**
+ * check if a u32 is contained by set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to query
+ * @returns true if value is (probably) in the set, false if it definitely is not
+ */
+ inline virtual bool contains(uint32_t val) const
+ {
+ if (!bit_table_)
+ return false;
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+ {
+ if (!bit_table_)
+ return false;
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ template<typename T>
+ inline bool contains(const T& t) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
+ }
+
+ inline bool contains(const std::string& key) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline bool contains(const char* data, const std::size_t& length) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (!contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return table_size_ * bits_per_char;
+ }
+
+ inline std::size_t element_count() const
+ {
+ return insert_count_;
+ }
+
+ /*
+ * density of bits set. inconvenient units, but:
+ * .3 = ~50% target insertions
+ * .5 = 100% target insertions, "perfectly full"
+ * .75 = 200% target insertions
+ * 1.0 = all bits set... infinite insertions
+ */
+ inline double density() const
+ {
+ if (!bit_table_)
+ return 0.0;
+ size_t set = 0;
+ uint8_t *p = bit_table_;
+ size_t left = table_size_;
+ while (left-- > 0) {
+ uint8_t c = *p;
+ for (; c; ++set)
+ c &= c - 1;
+ ++p;
+ }
+ return (double)set / (double)(table_size_ << 3);
+ }
+
+ virtual inline double approx_unique_element_count() const {
+ // this is not a very good estimate; a better solution should have
+ // some asymptotic behavior as density() approaches 1.0.
+ return (double)target_element_count_ * 2.0 * density();
+ }
+
+ inline double effective_fpp() const
+ {
+ /*
+ Note:
+ The effective false positive probability is calculated using the
+ designated table size and hash function count in conjunction with
+ the current number of inserted elements - not the user defined
+ predicated/expected number of inserted elements.
+ */
+ return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
+ }
+
+ inline bloom_filter& operator &= (const bloom_filter& filter)
+ {
+ /* intersection */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < table_size_; ++i) {
+ bit_table_[i] &= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator |= (const bloom_filter& filter)
+ {
+ /* union */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < table_size_; ++i) {
+ bit_table_[i] |= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator ^= (const bloom_filter& filter)
+ {
+ /* difference */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < table_size_; ++i) {
+ bit_table_[i] ^= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline const cell_type* table() const
+ {
+ return bit_table_;
+ }
+
+protected:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash % (table_size_ << 3);
+ bit = bit_index & 7;
+ }
+
+ void generate_unique_salt()
+ {
+ /*
+ Note:
+ A distinct hash function need not be implementation-wise
+ distinct. In the current implementation "seeding" a common
+ hash function with different values seems to be adequate.
+ */
+ const unsigned int predef_salt_count = 128;
+ static const bloom_type predef_salt[predef_salt_count] = {
+ 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+ 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+ 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+ 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+ 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+ 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+ 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+ 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+ 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+ 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+ 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+ 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+ 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+ 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+ 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+ 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+ 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+ 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+ 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+ 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+ 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+ 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+ 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+ 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+ 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+ 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+ 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+ 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+ 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+ 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+ 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+ 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+ };
+
+ if (salt_count_ <= predef_salt_count)
+ {
+ std::copy(predef_salt,
+ predef_salt + salt_count_,
+ std::back_inserter(salt_));
+ for (unsigned int i = 0; i < salt_.size(); ++i)
+ {
+ /*
+ Note:
+ This is done to integrate the user defined random seed,
+ so as to allow for the generation of unique bloom filter
+ instances.
+ */
+ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
+ }
+ }
+ else
+ {
+ std::copy(predef_salt,predef_salt + predef_salt_count,
+ std::back_inserter(salt_));
+ srand(static_cast<unsigned int>(random_seed_));
+ while (salt_.size() < salt_count_)
+ {
+ bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+ if (0 == current_salt)
+ continue;
+ if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
+ {
+ salt_.push_back(current_salt);
+ }
+ }
+ }
+ }
+
+ static void find_optimal_parameters(std::size_t target_insert_count,
+ double target_fpp,
+ std::size_t *salt_count,
+ std::size_t *table_size)
+ {
+ /*
+ Note:
+ The following will attempt to find the number of hash functions
+ and minimum amount of storage bits required to construct a bloom
+ filter consistent with the user defined false positive probability
+ and estimated element insertion count.
+ */
+
+ double min_m = std::numeric_limits<double>::infinity();
+ double min_k = 0.0;
+ double curr_m = 0.0;
+ double k = 1.0;
+ while (k < 1000.0)
+ {
+ double numerator = (- k * target_insert_count);
+ double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+ curr_m = numerator / denominator;
+
+ if (curr_m < min_m)
+ {
+ min_m = curr_m;
+ min_k = k;
+ }
+ k += 1.0;
+ }
+
+ *salt_count = static_cast<std::size_t>(min_k);
+ size_t t = static_cast<std::size_t>(min_m);
+ t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0);
+ *table_size = t >> 3;
+ }
+
+ inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
+ {
+ hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+ return hash;
+ }
+
+ inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+ {
+ const unsigned char* itr = begin;
+
+ while (remaining_length >= 4)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 4;
+ }
+
+ while (remaining_length >= 2)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 2;
+ }
+
+ if (remaining_length)
+ {
+ hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
+ }
+
+ return hash;
+ }
+
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result &= b;
+ return result;
+}
+
+inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result |= b;
+ return result;
+}
+
+inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result ^= b;
+ return result;
+}
+
+
/**
 * Bloom filter whose bit table can be shrunk after construction by
 * folding the tail of the table into the head (compress()).  Queries
 * replay the whole size history so that bits inserted before any
 * compression still land in the right place.
 */
class compressible_bloom_filter : public bloom_filter
{
public:

  compressible_bloom_filter() : bloom_filter() {}

  compressible_bloom_filter(const std::size_t& predicted_element_count,
			    const double& false_positive_probability,
			    const std::size_t& random_seed)
    : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
  {
    // history starts with the initial table size
    size_list.push_back(table_size_);
  }

  compressible_bloom_filter(const std::size_t& salt_count,
			    std::size_t table_size,
			    const std::size_t& random_seed,
			    std::size_t target_count)
    : bloom_filter(salt_count, table_size, random_seed, target_count)
  {
    size_list.push_back(table_size_);
  }

  /// @return current (possibly compressed) table size in bits
  inline virtual std::size_t size() const
  {
    return size_list.back() * bits_per_char;
  }

  /**
   * Shrink the bit table to target_ratio of its current size.
   *
   * @param target_ratio new/old size ratio, must be in (0,1)
   * @return true if the table was shrunk, false if the ratio was out of
   *         range, the table is empty, or the new size would be 0 or
   *         not smaller
   */
  inline bool compress(const double& target_ratio)
  {
    if (!bit_table_)
      return false;

    if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
    {
      return false;
    }

    std::size_t original_table_size = size_list.back();
    std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);

    if ((!new_table_size) || (new_table_size >= original_table_size))
    {
      return false;
    }

    // Copy the head of the table, then OR the remaining tail bytes onto
    // it, wrapping around the new (smaller) table.
    cell_type* tmp = new cell_type[new_table_size];
    std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
    cell_type* itr = bit_table_ + (new_table_size);
    cell_type* end = bit_table_ + (original_table_size);
    cell_type* itr_tmp = tmp;
    cell_type* itr_end = tmp + (new_table_size);
    while (end != itr)
    {
      *(itr_tmp++) |= (*itr++);
      if (itr_tmp == itr_end)
	itr_tmp = tmp;
    }

    delete[] bit_table_;
    bit_table_ = tmp;
    size_list.push_back(new_table_size);
    table_size_ = new_table_size;

    return true;
  }

  virtual inline double approx_unique_element_count() const {
    // this is not a very good estimate; a better solution should have
    // some asymptotic behavior as density() approaches 1.0.
    //
    // the compress() correction is also bad; it tends to under-estimate.
    return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
  }

private:

  // Reduce the hash modulo each historical table size in order, so a
  // bit set before a compress() folds to the same index a fresh query
  // computes now.
  inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
  {
    bit_index = hash;
    for (std::size_t i = 0; i < size_list.size(); ++i)
    {
      bit_index %= size_list[i] << 3;
    }
    bit = bit_index & 7;
  }

  std::vector<std::size_t> size_list;  ///< table sizes, oldest first
public:
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
};
+WRITE_CLASS_ENCODER(compressible_bloom_filter)
+
+#endif
+
+
+/*
+ Note 1:
+ If it can be guaranteed that bits_per_char will be of the form 2^n then
+ the following optimization can be used:
+
+ hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
+
+ Note 2:
+ For performance reasons where possible when allocating memory it should
+ be aligned (aligned_alloc) according to the architecture being used.
+*/
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 24a61724c8e..8da4c106d1b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1270,6 +1270,15 @@ int buffer::list::write_fd(int fd) const
return 0;
}
// Fold each non-empty buffer segment into the running crc32c value and
// return the result; empty ptrs are skipped since they contribute
// nothing.  `crc` is the initial/seed value.
__u32 buffer::list::crc32c(__u32 crc) const
{
  for (std::list<ptr>::const_iterator it = _buffers.begin();
       it != _buffers.end();
       ++it)
    if (it->length())
      crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
  return crc;
}
void buffer::list::hexdump(std::ostream &out) const
{
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
index 2950a81f89d..6c8053897f3 100644
--- a/src/common/ceph_argparse.cc
+++ b/src/common/ceph_argparse.cc
@@ -464,18 +464,19 @@ CephInitParameters ceph_argparse_early_args
static void generic_usage(bool is_server)
{
cout << "\
- --conf/-c Read configuration from the given configuration file\n\
- --id/-i set ID portion of my name\n\
- --name/-n set name (TYPE.ID)\n\
- --version show version and quit\n\
+ --conf/-c FILE read configuration from the given configuration file\n\
+ --id/-i ID set ID portion of my name\n\
+ --name/-n TYPE.ID set name\n\
+ --cluster NAME set cluster name (default: ceph)\n\
+ --version show version and quit\n\
" << std::endl;
if (is_server) {
cout << "\
- -d Run in foreground, log to stderr.\n\
- -f Run in foreground, log to usual location.\n";
- cout << " --debug_ms N\n";
- cout << " set message debug level (e.g. 1)\n";
+ -d run in foreground, log to stderr.\n\
+ -f run in foreground, log to usual location.\n";
+ cout << "\
+ --debug_ms N set message debug level (e.g. 1)\n";
}
}
diff --git a/src/common/ceph_json.cc b/src/common/ceph_json.cc
index 84355575c6c..a48e0636fcf 100644
--- a/src/common/ceph_json.cc
+++ b/src/common/ceph_json.cc
@@ -222,9 +222,7 @@ bool JSONParser::parse(const char *buf_, int len)
return false;
}
- string json_string = buf_;
- // make a substring to len
- json_string = json_string.substr(0, len);
+ string json_string(buf_, len);
success = read(json_string, data);
if (success)
handle_value(data);
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
index cd08083967a..47648ce19b3 100644
--- a/src/common/ceph_strings.cc
+++ b/src/common/ceph_strings.cc
@@ -50,6 +50,8 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_COPY_GET: return "copy-get";
case CEPH_OSD_OP_COPY_FROM: return "copy-from";
+ case CEPH_OSD_OP_UNDIRTY: return "undirty";
+ case CEPH_OSD_OP_ISDIRTY: return "isdirty";
case CEPH_OSD_OP_CLONERANGE: return "clonerange";
case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index b0dead48763..700a210b412 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -158,6 +158,8 @@ OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock
OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted
OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
+OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20) // min # pgs per (in) osd before we warn the admin
+OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew few average in objects per pg
OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc
@@ -360,12 +362,6 @@ OPTION(mds_standby_replay, OPT_BOOL, false)
// If true, compact leveldb store on mount
OPTION(osd_compact_leveldb_on_mount, OPT_BOOL, false)
-// If true, uses tmap as initial value for omap on old objects
-OPTION(osd_auto_upgrade_tmap, OPT_BOOL, true)
-
-// If true, TMAPPUT sets uses_tmap DEBUGGING ONLY
-OPTION(osd_tmapput_sets_uses_tmap, OPT_BOOL, false)
-
// Maximum number of backfills to or from a single osd
OPTION(osd_max_backfills, OPT_U64, 10)
@@ -520,7 +516,7 @@ OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
OPTION(osd_max_object_size, OPT_U64, 100*1024L*1024L*1024L) // OSD's maximum object size
-OPTION(osd_max_attr_size, OPT_U64, 65536)
+OPTION(osd_max_attr_size, OPT_U64, 0)
OPTION(filestore, OPT_BOOL, false)
@@ -548,12 +544,22 @@ OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0)
OPTION(filestore_debug_inject_read_err, OPT_BOOL, false)
OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync
+
// Use omap for xattrs for attrs over
-OPTION(filestore_xattr_use_omap, OPT_BOOL, false)
// filestore_max_inline_xattr_size or
-OPTION(filestore_max_inline_xattr_size, OPT_U32, 512)
+OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) //Override
+OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536)
+OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048)
+OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512)
+
// for more than filestore_max_inline_xattrs attrs
-OPTION(filestore_max_inline_xattrs, OPT_U32, 2)
+OPTION(filestore_max_inline_xattrs, OPT_U32, 0) //Override
+OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2)
+
+OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs
+OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5) // seconds
OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01) // seconds
diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c
index 0532dd261cf..42338a7bcd4 100644
--- a/src/common/crc32c_intel_fast.c
+++ b/src/common/crc32c_intel_fast.c
@@ -1,6 +1,5 @@
#include "acconfig.h"
#include "include/int_types.h"
-
#include "common/crc32c_intel_baseline.h"
extern unsigned int crc32_iscsi_00(unsigned char const *buffer, int len, unsigned int crc);
diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h
index 7a394a0b82c..26a444f6061 100644
--- a/src/common/crc32c_intel_fast.h
+++ b/src/common/crc32c_intel_fast.h
@@ -8,7 +8,7 @@ extern "C" {
/* is the fast version compiled in */
extern int ceph_crc32c_intel_fast_exists(void);
-#ifdef __LP64__
+#ifdef __x86_64__
extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len);
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index d6273693c62..b68baedd524 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -191,3 +191,90 @@ ostream& operator<<(ostream& out, const hobject_t& o)
out << "/" << o.nspace << "/" << o.pool;
return out;
}
+
+// This is compatible with decode for hobject_t prior to
+// version 5.
void ghobject_t::encode(bufferlist& bl) const
{
  // Fields are written in the same order as the embedded hobject_t's
  // own encoding so pre-v5 hobject_t decoders can still read the hobj
  // part; generation/shard_id were added in struct_v 5 (see decode()).
  ENCODE_START(5, 3, bl);
  ::encode(hobj.key, bl);
  ::encode(hobj.oid, bl);
  ::encode(hobj.snap, bl);
  ::encode(hobj.hash, bl);
  ::encode(hobj.max, bl);
  ::encode(hobj.nspace, bl);
  ::encode(hobj.pool, bl);
  ::encode(generation, bl);
  ::encode(shard_id, bl);
  ENCODE_FINISH(bl);
}
+
// Decode a ghobject_t, accepting legacy hobject_t encodings (compat 3);
// fields added in later struct versions are defaulted when absent.
void ghobject_t::decode(bufferlist::iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
  if (struct_v >= 1)
    ::decode(hobj.key, bl);
  ::decode(hobj.oid, bl);
  ::decode(hobj.snap, bl);
  ::decode(hobj.hash, bl);
  if (struct_v >= 2)
    ::decode(hobj.max, bl);
  else
    hobj.max = false;  // v1 had no explicit max flag
  if (struct_v >= 4) {
    ::decode(hobj.nspace, bl);
    ::decode(hobj.pool, bl);
  }
  if (struct_v >= 5) {
    ::decode(generation, bl);
    ::decode(shard_id, bl);
  } else {
    // pre-v5 streams are plain hobject_t's: no generation or shard
    generation = ghobject_t::NO_GEN;
    shard_id = ghobject_t::NO_SHARD;
  }
  DECODE_FINISH(bl);
}
+
+void ghobject_t::dump(Formatter *f) const
+{
+ hobj.dump(f);
+ if (generation != NO_GEN) {
+ f->dump_int("generation", generation);
+ f->dump_int("shard_id", shard_id);
+ }
+}
+
// Test vectors: degenerate instances (no gen/shard) first, then
// instances varying shard at fixed generation and generation at fixed
// shard, to exercise the full comparison/encoding matrix.
void ghobject_t::generate_test_instances(list<ghobject_t*>& o)
{
  o.push_back(new ghobject_t);
  o.push_back(new ghobject_t);
  o.back()->hobj.max = true;
  o.push_back(new ghobject_t(hobject_t(object_t("oname"), string(), 1, 234, -1, "")));

  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
	67, 0, "n1"), 1, 0));
  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
	67, 0, "n1"), 1, 1));
  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
	67, 0, "n1"), 1, 2));
  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
	CEPH_SNAPDIR, 910, 1, "n2"), 1, 0));
  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
	CEPH_SNAPDIR, 910, 1, "n2"), 2, 0));
  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
	CEPH_SNAPDIR, 910, 1, "n2"), 3, 0));
  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
	CEPH_SNAPDIR, 910, 1, "n2"), 3, 1));
  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
	CEPH_SNAPDIR, 910, 1, "n2"), 3, 2));
}
+
+ostream& operator<<(ostream& out, const ghobject_t& o)
+{
+ out << o.hobj;
+ if (o.generation != ghobject_t::NO_GEN) {
+ assert(o.shard_id != ghobject_t::NO_SHARD);
+ out << "/" << o.generation << "/" << o.shard_id;
+ }
+ return out;
+}
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 633e471dffc..a769ad060d9 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -79,6 +79,30 @@ public:
return ret;
}
+ /// @return head version of this hobject_t
+ hobject_t get_head() const {
+ hobject_t ret(*this);
+ ret.snap = CEPH_NOSNAP;
+ return ret;
+ }
+
+ /// @return snapdir version of this hobject_t
+ hobject_t get_snapdir() const {
+ hobject_t ret(*this);
+ ret.snap = CEPH_SNAPDIR;
+ return ret;
+ }
+
+ /// @return true if object is neither head nor snapdir
+ bool is_snap() const {
+ return (snap != CEPH_NOSNAP) && (snap != CEPH_SNAPDIR);
+ }
+
+ /// @return true iff the object should have a snapset in it's attrs
+ bool has_snapset() const {
+ return !is_snap();
+ }
+
/* Do not use when a particular hash function is needed */
explicit hobject_t(const sobject_t &o) :
oid(o.oid), snap(o.snap), max(false), pool(-1) {
@@ -138,7 +162,7 @@ public:
(*this) = temp;
}
- string get_namespace() const {
+ const string &get_namespace() const {
return nspace;
}
@@ -153,6 +177,7 @@ public:
friend bool operator>=(const hobject_t&, const hobject_t&);
friend bool operator==(const hobject_t&, const hobject_t&);
friend bool operator!=(const hobject_t&, const hobject_t&);
+ friend class ghobject_t;
};
WRITE_CLASS_ENCODER(hobject_t)
@@ -179,4 +204,102 @@ WRITE_CMP_OPERATORS_7(hobject_t,
oid,
snap)
+typedef uint64_t gen_t;
+typedef uint8_t shard_t;
+
+#ifndef UINT8_MAX
+#define UINT8_MAX (255)
+#endif
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+
/**
 * A "generalized" hobject_t: an hobject_t plus an optional generation
 * and shard id.  An instance with generation == NO_GEN and
 * shard_id == NO_SHARD ("degenerate") is equivalent to a plain
 * hobject_t.
 */
struct ghobject_t {
  hobject_t hobj;       ///< the underlying object
  gen_t generation;     ///< object generation, NO_GEN if unset
  shard_t shard_id;     ///< shard, NO_SHARD if unset

public:
  static const shard_t NO_SHARD = UINT8_MAX;
  static const gen_t NO_GEN = UINT64_MAX;

  ghobject_t() : generation(NO_GEN), shard_id(NO_SHARD) {}

  // implicit conversion from hobject_t yields a degenerate ghobject_t
  ghobject_t(const hobject_t &obj) : hobj(obj), generation(NO_GEN), shard_id(NO_SHARD) {}

  ghobject_t(const hobject_t &obj, gen_t gen, shard_t shard) : hobj(obj), generation(gen), shard_id(shard) {}

  // delegate hash-prefix matching to the embedded hobject
  bool match(uint32_t bits, uint32_t match) const {
    return hobj.match_hash(hobj.hash, bits, match);
  }
  /// @return min ghobject_t ret s.t. ret.hash == this->hash
  ghobject_t get_boundary() const {
    if (hobj.is_max())
      return *this;
    ghobject_t ret;
    ret.hobj.hash = hobj.hash;
    return ret;
  }
  filestore_hobject_key_t get_filestore_key_u32() const {
    return hobj.get_filestore_key_u32();
  }
  filestore_hobject_key_t get_filestore_key() const {
    return hobj.get_filestore_key();
  }

  /// @return true if neither generation nor shard is set
  bool is_degenerate() const {
    return generation == NO_GEN && shard_id == NO_SHARD;
  }

  // maximum sorted value.
  static ghobject_t get_max() {
    ghobject_t h(hobject_t::get_max());
    return h;
  }
  bool is_max() const {
    return hobj.is_max();
  }

  void swap(ghobject_t &o) {
    ghobject_t temp(o);
    o = (*this);
    (*this) = temp;
  }

  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  void decode(json_spirit::Value& v);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<ghobject_t*>& o);
  friend bool operator<(const ghobject_t&, const ghobject_t&);
  friend bool operator>(const ghobject_t&, const ghobject_t&);
  friend bool operator<=(const ghobject_t&, const ghobject_t&);
  friend bool operator>=(const ghobject_t&, const ghobject_t&);
  friend bool operator==(const ghobject_t&, const ghobject_t&);
  friend bool operator!=(const ghobject_t&, const ghobject_t&);
};
+WRITE_CLASS_ENCODER(ghobject_t)
+
namespace __gnu_cxx {
  template<> struct hash<ghobject_t> {
    size_t operator()(const ghobject_t &r) const {
      static hash<object_t> H;
      static rjhash<uint64_t> I;
      // Only oid and snap feed the hash; other fields (hash, pool,
      // generation, shard) affect only equality.  Still consistent --
      // equal objects hash equally -- but collisions are possible
      // across generations/shards of the same object.
      return H(r.hobj.oid) ^ I(r.hobj.snap);
    }
  };
}
+
+ostream& operator<<(ostream& out, const ghobject_t& o);
+
+WRITE_EQ_OPERATORS_3(ghobject_t, hobj, shard_id, generation)
+// sort ghobject_t's by <hobj, shard_id, generation>
+//
+// Two objects which differ by generation are more related than
+// two objects of the same generation which differ by shard.
+//
+WRITE_CMP_OPERATORS_3(ghobject_t,
+ hobj,
+ shard_id,
+ generation)
#endif
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
index ac99db04ad3..afee82edf07 100644
--- a/src/common/safe_io.c
+++ b/src/common/safe_io.c
@@ -14,8 +14,12 @@
#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
#include "common/safe_io.h"
@@ -112,3 +116,79 @@ ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
}
return 0;
}
+
+int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen)
+{
+ int ret;
+ char fn[PATH_MAX];
+ char tmp[PATH_MAX];
+ int fd;
+
+ // does the file already have correct content?
+ char oldval[80];
+ ret = safe_read_file(base, file, oldval, sizeof(oldval));
+ if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
+ return 0; // yes.
+
+ snprintf(fn, sizeof(fn), "%s/%s", base, file);
+ snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);
+ fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (fd < 0) {
+ ret = errno;
+ return -ret;
+ }
+ ret = safe_write(fd, val, vallen);
+ if (ret) {
+ TEMP_FAILURE_RETRY(close(fd));
+ return ret;
+ }
+
+ ret = fsync(fd);
+ if (ret < 0) ret = -errno;
+ TEMP_FAILURE_RETRY(close(fd));
+ if (ret < 0) {
+ unlink(tmp);
+ return ret;
+ }
+ ret = rename(tmp, fn);
+ if (ret < 0) {
+ ret = -errno;
+ unlink(tmp);
+ return ret;
+ }
+
+ fd = open(base, O_RDONLY);
+ if (fd < 0) {
+ ret = -errno;
+ return ret;
+ }
+ ret = fsync(fd);
+ if (ret < 0) ret = -errno;
+ TEMP_FAILURE_RETRY(close(fd));
+
+ return ret;
+}
+
+int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen)
+{
+ char fn[PATH_MAX];
+ int fd, len;
+
+ snprintf(fn, sizeof(fn), "%s/%s", base, file);
+ fd = open(fn, O_RDONLY);
+ if (fd < 0) {
+ return -errno;
+ }
+ len = safe_read(fd, val, vallen - 1);
+ if (len < 0) {
+ TEMP_FAILURE_RETRY(close(fd));
+ return len;
+ }
+ // close sometimes returns errors, but only after write()
+ TEMP_FAILURE_RETRY(close(fd));
+
+ val[len] = 0;
+ return len;
+}
diff --git a/src/common/safe_io.h b/src/common/safe_io.h
index 4c2991fe6e8..a4c9bc7a72f 100644
--- a/src/common/safe_io.h
+++ b/src/common/safe_io.h
@@ -45,6 +45,15 @@ extern "C" {
ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
WARN_UNUSED_RESULT;
+
+ /*
+ * Safe functions to read and write an entire file.
+ */
+ int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen);
+ int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/common/util.cc b/src/common/util.cc
index 6da37e88833..ab417befef6 100644
--- a/src/common/util.cc
+++ b/src/common/util.cc
@@ -58,6 +58,7 @@ int64_t unit_to_bytesize(string val, ostream *pss)
switch (c) {
case 'B':
break;
+ case 'k':
case 'K':
modifier = 10;
break;
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index bab9f9a817e..d17166bc4a9 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -861,7 +861,6 @@ void CrushWrapper::decode(bufferlist::iterator& blp)
decode_32_or_64_string_map(type_map, blp);
decode_32_or_64_string_map(name_map, blp);
decode_32_or_64_string_map(rule_name_map, blp);
- build_rmaps();
// tunables
if (!blp.end()) {
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 80906e4fe18..b4bb67bb742 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -94,6 +94,7 @@ public:
crush_destroy(crush);
crush = crush_create();
assert(crush);
+ have_rmaps = false;
}
// tunables
diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc
index ce604fe1e5d..ffdc5402caf 100644
--- a/src/global/signal_handler.cc
+++ b/src/global/signal_handler.cc
@@ -196,13 +196,13 @@ struct SignalHandler : public Thread {
lock.Lock();
int num_fds = 0;
fds[num_fds].fd = pipefd[0];
- fds[num_fds].events = POLLIN | POLLOUT | POLLERR;
+ fds[num_fds].events = POLLIN | POLLERR;
fds[num_fds].revents = 0;
++num_fds;
for (unsigned i=0; i<32; i++) {
if (handlers[i]) {
fds[num_fds].fd = handlers[i]->pipefd[0];
- fds[num_fds].events = POLLIN | POLLOUT | POLLERR;
+ fds[num_fds].events = POLLIN | POLLERR;
fds[num_fds].revents = 0;
++num_fds;
}
diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h
index 26c438c05f2..b23883093ac 100644
--- a/src/include/CompatSet.h
+++ b/src/include/CompatSet.h
@@ -36,8 +36,8 @@ struct CompatSet {
FeatureSet() : mask(1), names() {}
void insert(Feature f) {
assert(f.id > 0);
- assert(f.id < 63);
- mask |= (1<<f.id);
+ assert(f.id < 64);
+ mask |= ((uint64_t)1<<f.id);
names[f.id] = f.name;
}
@@ -50,7 +50,7 @@ struct CompatSet {
void remove(uint64_t f) {
if (names.count(f)) {
names.erase(f);
- mask &= ~(1<<f);
+ mask &= ~((uint64_t)1<<f);
}
}
void remove(Feature f) {
@@ -156,24 +156,48 @@ struct CompatSet {
((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
uint64_t other_incompat =
((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
- for (int i = 0; i < 64; ++i) {
- int mask = 1 << i;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
if (mask & other_compat) {
- diff.compat.insert( Feature(mask & other_compat,
- other.compat.names[mask&other_compat]));
+ diff.compat.insert( Feature(id, other.compat.names[id]));
}
if (mask & other_ro_compat) {
- diff.ro_compat.insert(Feature(mask & other_ro_compat,
- other.compat.names[mask&other_ro_compat]));
+ diff.ro_compat.insert(Feature(id, other.ro_compat.names[id]));
}
if (mask & other_incompat) {
- diff.incompat.insert( Feature(mask & other_incompat,
- other.incompat.names[mask&other_incompat]));
+ diff.incompat.insert( Feature(id, other.incompat.names[id]));
}
}
return diff;
}
+ /* Merge features supported by other CompatSet into this one.
+ * Return: true if some features were merged
+ */
+ bool merge(CompatSet& other) {
+ uint64_t other_compat =
+ ((other.compat.mask ^ compat.mask) & other.compat.mask);
+ uint64_t other_ro_compat =
+ ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask);
+ uint64_t other_incompat =
+ ((other.incompat.mask ^ incompat.mask) & other.incompat.mask);
+ if (!other_compat && !other_ro_compat && !other_incompat)
+ return false;
+ for (int id = 1; id < 64; ++id) {
+ uint64_t mask = (uint64_t)1 << id;
+ if (mask & other_compat) {
+ compat.insert( Feature(id, other.compat.names[id]));
+ }
+ if (mask & other_ro_compat) {
+ ro_compat.insert(Feature(id, other.ro_compat.names[id]));
+ }
+ if (mask & other_incompat) {
+ incompat.insert( Feature(id, other.incompat.names[id]));
+ }
+ }
+ return true;
+ }
+
void encode(bufferlist& bl) const {
compat.encode(bl);
ro_compat.encode(bl);
diff --git a/src/include/Context.h b/src/include/Context.h
index 9ec4414a047..663313ceec1 100644
--- a/src/include/Context.h
+++ b/src/include/Context.h
@@ -28,6 +28,26 @@
#define mydout(cct, v) lgeneric_subdout(cct, context, v)
/*
+ * GenContext - abstract callback class
+ */
+template <typename T>
+class GenContext {
+ GenContext(const GenContext& other);
+ const GenContext& operator=(const GenContext& other);
+
+ protected:
+ virtual void finish(T t) = 0;
+
+ public:
+ GenContext() {}
+ virtual ~GenContext() {} // we want a virtual destructor!!!
+ virtual void complete(T t) {
+ finish(t);
+ delete this;
+ }
+};
+
+/*
* Context - abstract callback class
*/
class Context {
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index d702ebd2795..c8823ce523d 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -18,7 +18,6 @@ rados_include_DATA = \
$(srcdir)/include/crc32c.h
noinst_HEADERS += \
- include/bloom_filter.hpp \
include/Context.h \
include/CompatSet.h \
include/Distribution.h \
@@ -44,6 +43,7 @@ noinst_HEADERS += \
include/filepath.h \
include/frag.h \
include/hash.h \
+ include/histogram.h \
include/intarith.h \
include/interval_set.h \
include/int_types.h \
diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp
deleted file mode 100644
index 41aba4bad47..00000000000
--- a/src/include/bloom_filter.hpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- *******************************************************************
- * *
- * Open Bloom Filter *
- * *
- * Author: Arash Partow - 2000 *
- * URL: http://www.partow.net/programming/hashfunctions/index.html *
- * *
- * Copyright notice: *
- * Free use of the Open Bloom Filter Library is permitted under *
- * the guidelines and in accordance with the most current version *
- * of the Boost Software License, Version 1.0 *
- * http://www.opensource.org/licenses/bsl1.0.html *
- * *
- *******************************************************************
-*/
-
-
-#ifndef INCLUDE_BLOOM_FILTER_HPP
-#define INCLUDE_BLOOM_FILTER_HPP
-
-#include <cstddef>
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-#include <vector>
-
-
-static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
-static const unsigned char bit_mask[bits_per_char] = {
- 0x01, //00000001
- 0x02, //00000010
- 0x04, //00000100
- 0x08, //00001000
- 0x10, //00010000
- 0x20, //00100000
- 0x40, //01000000
- 0x80 //10000000
- };
-
-
-class bloom_filter
-{
-protected:
-
- typedef unsigned int bloom_type;
- typedef unsigned char cell_type;
-
-public:
-
- bloom_filter(const std::size_t& predicted_inserted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bit_table_(0),
- predicted_inserted_element_count_(predicted_inserted_element_count),
- inserted_element_count_(0),
- random_seed_((random_seed) ? random_seed : 0xA5A5A5A5),
- desired_false_positive_probability_(false_positive_probability)
- {
- find_optimal_parameters();
- generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- bit_table_ = new cell_type[raw_table_size_];
- std::fill_n(bit_table_,raw_table_size_,0x00);
- }
-
- bloom_filter(const bloom_filter& filter)
- {
- this->operator=(filter);
- }
-
- bloom_filter& operator = (const bloom_filter& filter)
- {
- if (this != &filter) {
- salt_count_ = filter.salt_count_;
- table_size_ = filter.table_size_;
- raw_table_size_ = filter.raw_table_size_;
- predicted_inserted_element_count_ = filter.predicted_inserted_element_count_;
- inserted_element_count_ = filter.inserted_element_count_;
- random_seed_ = filter.random_seed_;
- desired_false_positive_probability_ = filter.desired_false_positive_probability_;
- delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
- salt_ = filter.salt_;
- }
- return *this;
- }
-
- virtual ~bloom_filter()
- {
- delete[] bit_table_;
- }
-
- inline bool operator!() const
- {
- return (0 == table_size_);
- }
-
- inline void clear()
- {
- std::fill_n(bit_table_,raw_table_size_,0x00);
- inserted_element_count_ = 0;
- }
-
- inline void insert(const unsigned char* key_begin, const std::size_t& length)
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
- }
- ++inserted_element_count_;
- }
-
- template<typename T>
- inline void insert(const T& t)
- {
- // Note: T must be a C++ POD type.
- insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
- }
-
- inline void insert(const std::string& key)
- {
- insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline void insert(const char* data, const std::size_t& length)
- {
- insert(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline void insert(const InputIterator begin, const InputIterator end)
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- insert(*(itr++));
- }
- }
-
- inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
- {
- return false;
- }
- }
- return true;
- }
-
- template<typename T>
- inline bool contains(const T& t) const
- {
- return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
- }
-
- inline bool contains(const std::string& key) const
- {
- return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline bool contains(const char* data, const std::size_t& length) const
- {
- return contains(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (!contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- template<typename InputIterator>
- inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- inline virtual std::size_t size() const
- {
- return table_size_;
- }
-
- inline std::size_t element_count() const
- {
- return inserted_element_count_;
- }
-
- inline double effective_fpp() const
- {
- /*
- Note:
- The effective false positive probability is calculated using the
- designated table size and hash function count in conjunction with
- the current number of inserted elements - not the user defined
- predicated/expected number of inserted elements.
- */
- return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
- }
-
- inline bloom_filter& operator &= (const bloom_filter& filter)
- {
- /* intersection */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] &= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator |= (const bloom_filter& filter)
- {
- /* union */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] |= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator ^= (const bloom_filter& filter)
- {
- /* difference */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] ^= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline const cell_type* table() const
- {
- return bit_table_;
- }
-
-protected:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
- }
-
- void generate_unique_salt()
- {
- /*
- Note:
- A distinct hash function need not be implementation-wise
- distinct. In the current implementation "seeding" a common
- hash function with different values seems to be adequate.
- */
- const unsigned int predef_salt_count = 128;
- static const bloom_type predef_salt[predef_salt_count] =
- {
- 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
- 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
- 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
- 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
- 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
- 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
- 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
- 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
- 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
- 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
- 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
- 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
- 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
- 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
- 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
- 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
- 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
- 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
- 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
- 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
- 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
- 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
- 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
- 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
- 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
- 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
- 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
- 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
- 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
- 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
- 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
- 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
- };
-
- if (salt_count_ <= predef_salt_count)
- {
- std::copy(predef_salt,
- predef_salt + salt_count_,
- std::back_inserter(salt_));
- for (unsigned int i = 0; i < salt_.size(); ++i)
- {
- /*
- Note:
- This is done to integrate the user defined random seed,
- so as to allow for the generation of unique bloom filter
- instances.
- */
- salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
- }
- }
- else
- {
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
- srand(static_cast<unsigned int>(random_seed_));
- while (salt_.size() < salt_count_)
- {
- bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
- if (0 == current_salt) continue;
- if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
- {
- salt_.push_back(current_salt);
- }
- }
- }
- }
-
- void find_optimal_parameters()
- {
- /*
- Note:
- The following will attempt to find the number of hash functions
- and minimum amount of storage bits required to construct a bloom
- filter consistent with the user defined false positive probability
- and estimated element insertion count.
- */
-
- double min_m = std::numeric_limits<double>::infinity();
- double min_k = 0.0;
- double curr_m = 0.0;
- double k = 1.0;
- while (k < 1000.0)
- {
- double numerator = (- k * predicted_inserted_element_count_);
- double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k));
- curr_m = numerator / denominator;
-
- if (curr_m < min_m)
- {
- min_m = curr_m;
- min_k = k;
- }
- k += 1.0;
- }
-
- salt_count_ = static_cast<std::size_t>(min_k);
- table_size_ = static_cast<std::size_t>(min_m);
- table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0);
- }
-
- inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
- {
- const unsigned char* itr = begin;
-
- while (remaining_length >= 4)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 4;
- }
-
- while (remaining_length >= 2)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 2;
- }
-
- if (remaining_length)
- {
- hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
- }
-
- return hash;
- }
-
- std::vector<bloom_type> salt_;
- unsigned char* bit_table_;
- std::size_t salt_count_;
- std::size_t table_size_;
- std::size_t raw_table_size_;
- std::size_t predicted_inserted_element_count_;
- std::size_t inserted_element_count_;
- std::size_t random_seed_;
- double desired_false_positive_probability_;
-};
-
-inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result &= b;
- return result;
-}
-
-inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result |= b;
- return result;
-}
-
-inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result ^= b;
- return result;
-}
-
-
-class compressible_bloom_filter : public bloom_filter
-{
-public:
-
- compressible_bloom_filter(const std::size_t& predicted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
- {
- size_list.push_back(table_size_);
- }
-
- inline virtual std::size_t size() const
- {
- return size_list.back();
- }
-
- inline bool compress(const double& percentage)
- {
- if ((0.0 >= percentage) || (percentage >= 100.0))
- {
- return false;
- }
-
- std::size_t original_table_size = size_list.back();
- std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
- new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
-
- if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
- {
- return false;
- }
-
- desired_false_positive_probability_ = effective_fpp();
- cell_type* tmp = new cell_type[new_table_size / bits_per_char];
- std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
- cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
- cell_type* end = bit_table_ + (original_table_size / bits_per_char);
- cell_type* itr_tmp = tmp;
-
- while (end != itr)
- {
- *(itr_tmp++) |= (*itr++);
- }
-
- delete[] bit_table_;
- bit_table_ = tmp;
- size_list.push_back(new_table_size);
-
- return true;
- }
-
-private:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash;
- for (std::size_t i = 0; i < size_list.size(); ++i)
- {
- bit_index %= size_list[i];
- }
- bit = bit_index % bits_per_char;
- }
-
- std::vector<std::size_t> size_list;
-};
-
-#endif
-
-
-/*
- Note 1:
- If it can be guaranteed that bits_per_char will be of the form 2^n then
- the following optimization can be used:
-
- hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
-
- Note 2:
- For performance reasons where possible when allocating memory it should
- be aligned (aligned_alloc) according to the architecture being used.
-*/
diff --git a/src/include/buffer.h b/src/include/buffer.h
index 077cf0d9b0b..ffa3d6e1b97 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -14,8 +14,6 @@
#ifndef CEPH_BUFFER_H
#define CEPH_BUFFER_H
-#include "include/int_types.h"
-
#if defined(__linux__)
#include <stdlib.h>
#include <linux/types.h>
@@ -46,6 +44,7 @@ void *valloc(size_t);
#include <malloc.h>
#endif
+#include <inttypes.h>
#include <stdint.h>
#include <string.h>
@@ -420,15 +419,7 @@ public:
ssize_t read_fd(int fd, size_t len);
int write_file(const char *fn, int mode=0644);
int write_fd(int fd) const;
- __u32 crc32c(__u32 crc) {
- for (std::list<ptr>::const_iterator it = _buffers.begin();
- it != _buffers.end();
- ++it)
- if (it->length())
- crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
- return crc;
- }
-
+ uint32_t crc32c(uint32_t crc) const;
};
/*
@@ -436,7 +427,7 @@ public:
*/
class hash {
- __u32 crc;
+ uint32_t crc;
public:
hash() : crc(0) { }
@@ -445,7 +436,7 @@ public:
crc = bl.crc32c(crc);
}
- __u32 digest() {
+ uint32_t digest() {
return crc;
}
};
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 6c41d14f5da..ba0b5eb0f19 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -224,6 +224,7 @@ struct ceph_mon_subscribe_ack {
* mdsmap flags
*/
#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
+#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */
/*
* mds states
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index 8e22c624636..49d68474d68 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -1,8 +1,7 @@
#ifndef CEPH_CRC32C_H
#define CEPH_CRC32C_H
-#include "include/int_types.h"
-
+#include <inttypes.h>
#include <string.h>
typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
diff --git a/src/include/histogram.h b/src/include/histogram.h
new file mode 100644
index 00000000000..c817b1ec175
--- /dev/null
+++ b/src/include/histogram.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#ifndef HISTOGRAM_H_
+#define HISTOGRAM_H_
+
+/**
+ * power of 2 histogram
+ */
+struct pow2_hist_t { //
+ /**
+ * histogram
+ *
+ * bin size is 2^index
+ * value is count of elements that are <= the current bin but > the previous bin.
+ */
+ vector<int32_t> h;
+
+private:
+ /// expand to at least another's size
+ void _expand_to(unsigned s) {
+ if (s > h.size())
+ h.resize(s, 0);
+ }
+ /// drop useless trailing 0's
+ void _contract() {
+ unsigned p = h.size();
+ while (p > 0 && h[p-1] == 0)
+ --p;
+ h.resize(p);
+ }
+
+public:
+ void clear() {
+ h.clear();
+ }
+ void set(int bin, int32_t v) {
+ _expand_to(bin + 1);
+ h[bin] = v;
+ _contract();
+ }
+
+ void add(const pow2_hist_t& o) {
+ _expand_to(o.h.size());
+ for (unsigned p = 0; p < o.h.size(); ++p)
+ h[p] += o.h[p];
+ _contract();
+ }
+ void sub(const pow2_hist_t& o) {
+ _expand_to(o.h.size());
+ for (unsigned p = 0; p < o.h.size(); ++p)
+ h[p] -= o.h[p];
+ _contract();
+ }
+
+ int32_t upper_bound() const {
+ return 1 << h.size();
+ }
+
+ void dump(Formatter *f) const;
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ static void generate_test_instances(std::list<pow2_hist_t*>& o);
+};
+WRITE_CLASS_ENCODER(pow2_hist_t)
+
+#endif /* HISTOGRAM_H_ */
diff --git a/src/include/rados.h b/src/include/rados.h
index 178c171c445..e7a32b5afef 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -219,6 +219,8 @@ enum {
CEPH_OSD_OP_COPY_FROM = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 26,
CEPH_OSD_OP_COPY_GET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 27,
+ CEPH_OSD_OP_UNDIRTY = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 28,
+ CEPH_OSD_OP_ISDIRTY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 29,
/** multi **/
CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index a85ef3057bc..515663c2335 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -1,8 +1,6 @@
#ifndef CEPH_LIBRADOS_H
#define CEPH_LIBRADOS_H
-#include "include/int_types.h"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -566,7 +564,7 @@ int rados_pool_create_with_auid(rados_t cluster, const char *pool_name, uint64_t
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Create a pool with a specific CRUSH rule and auid
@@ -581,7 +579,7 @@ int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_all(rados_t cluster, const char *pool_name, uint64_t auid,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Delete a pool and all data inside it
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index 94d3d23a824..3f6d025ff41 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -133,11 +133,16 @@ namespace librados
* BALANCE_READS and LOCALIZE_READS should only be used
* when reading from data you're certain won't change,
* like a snapshot, or where eventual consistency is ok.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * will make a write followed by a read sequence be preserved.
*/
enum ObjectOperationGlobalFlags {
OPERATION_NOFLAG = 0,
OPERATION_BALANCE_READS = 1,
OPERATION_LOCALIZE_READS = 2,
+ OPERATION_ORDER_READS_WRITES = 4,
};
/*
@@ -278,6 +283,13 @@ namespace librados
*/
void copy_from(const std::string& src, const IoCtx& src_ioctx, uint64_t src_version);
+ /**
+ * undirty an object
+ *
+ * Clear an objects dirty flag
+ */
+ void undirty();
+
friend class IoCtx;
};
@@ -396,6 +408,14 @@ namespace librados
*/
void list_snaps(snap_set_t *out_snaps, int *prval);
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
};
/* IoCtx : This is a context in which we can perform I/O.
diff --git a/src/include/types.h b/src/include/types.h
index 1f9756b22c7..5a9e6f6d4c9 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -379,7 +379,7 @@ inline ostream& operator<<(ostream& out, const prettybyte_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << " MB";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " KB";
+ return out << (b.v >> 10) << " kB";
return out << b.v << " bytes";
}
@@ -402,7 +402,7 @@ inline ostream& operator<<(ostream& out, const si_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << "M";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << "K";
+ return out << (b.v >> 10) << "k";
return out << b.v;
}
@@ -425,7 +425,7 @@ inline ostream& operator<<(ostream& out, const pretty_si_t& b)
if (b.v > bump_after << 20)
return out << (b.v >> 20) << " M";
if (b.v > bump_after << 10)
- return out << (b.v >> 10) << " K";
+ return out << (b.v >> 10) << " k";
return out << b.v << " ";
}
@@ -445,7 +445,7 @@ inline ostream& operator<<(ostream& out, const kb_t& kb)
return out << (kb.v >> 20) << " GB";
if (kb.v > bump_after << 10)
return out << (kb.v >> 10) << " MB";
- return out << kb.v << " KB";
+ return out << kb.v << " kB";
}
inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i)
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 3a404a46c6f..46877d75558 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -80,7 +80,7 @@ stop_daemon() {
action=$5
[ -z "$action" ] && action="Stopping"
echo -n "$action Ceph $name on $host..."
- do_cmd "while [ 1 ]; do
+ do_cmd "while [ 1 ]; do
[ -e $pidfile ] || break
pid=\`cat $pidfile\`
while [ -e /proc/\$pid ] && grep -q $daemon /proc/\$pid/cmdline ; do
@@ -172,6 +172,14 @@ command=$1
get_local_name_list
get_name_list "$@"
+# Reverse the order if we are stopping
+if [ "$command" = "stop" ]; then
+ for f in $what; do
+ new_order="$f $new_order"
+ done
+ what="$new_order"
+fi
+
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
id=`echo $name | cut -c 4- | sed 's/^\\.//'`
@@ -251,18 +259,18 @@ for name in $what; do
wrap=""
runmode=""
runarg=""
-
+
[ -z "$docrun" ] && get_conf_bool docrun "0" "restart on core dump"
[ "$docrun" -eq 1 ] && wrap="$BINDIR/ceph-run"
-
+
[ -z "$dovalgrind" ] && get_conf_bool valgrind "" "valgrind"
[ -n "$valgrind" ] && wrap="$wrap valgrind $valgrind"
-
+
[ -n "$wrap" ] && runmode="-f &" && runarg="-f"
[ -n "$max_open_files" ] && files="ulimit -n $max_open_files;"
cmd="$files $wrap $cmd $runmode"
-
+
if [ $dofsmount -eq 1 ] && [ -n "$fs_devs" ]; then
get_conf pre_mount "true" "pre mount command"
get_conf fs_type "" "osd mkfs type"
@@ -361,7 +369,7 @@ for name in $what; do
[ -n "$post_start" ] && do_cmd "$post_start"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
;;
-
+
stop)
get_conf pre_stop "" "pre stop command"
get_conf post_stop "" "post stop command"
@@ -402,13 +410,13 @@ for name in $what; do
[ -n "$post_forcestop" ] && do_cmd "$post_forcestop"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;
-
+
killall)
echo "killall ceph-$type on $host"
do_cmd "pkill ^ceph-$type || true"
[ -n "$lockfile" ] && [ "$?" -eq 0 ] && rm -f $lockfile
;;
-
+
force-reload | reload)
signal_daemon $name ceph-$type $pid_file -1 "Reloading"
;;
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 8a5f499ec15..1be3ebd10f9 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -459,7 +459,6 @@ int librados::RadosClient::pool_create_async(string& name, PoolAsyncCompletionIm
Context *onfinish = new C_PoolAsync_Safe(c);
int r = objecter->create_pool(name, onfinish, auid, crush_rule);
if (r < 0) {
- delete c;
delete onfinish;
}
return r;
@@ -505,7 +504,6 @@ int librados::RadosClient::pool_delete_async(const char *name, PoolAsyncCompleti
Context *onfinish = new C_PoolAsync_Safe(c);
int r = objecter->delete_pool(tmp_pool_id, onfinish);
if (r < 0) {
- delete c;
delete onfinish;
}
return r;
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 852228ed383..217a0a7bfb2 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -269,6 +269,14 @@ void librados::ObjectReadOperation::list_snaps(
o->list_snaps(out_snaps, prval);
}
+void librados::ObjectReadOperation::is_dirty(bool *is_dirty, int *prval)
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->is_dirty(is_dirty, prval);
+}
+
+
+
int librados::IoCtx::omap_get_vals(const std::string& oid,
const std::string& start_after,
const std::string& filter_prefix,
@@ -390,6 +398,12 @@ void librados::ObjectWriteOperation::copy_from(const std::string& src,
o->copy_from(object_t(src), src_ioctx.io_ctx_impl->snap_seq, src_ioctx.io_ctx_impl->oloc, src_version);
}
+void librados::ObjectWriteOperation::undirty()
+{
+ ::ObjectOperation *o = (::ObjectOperation *)impl;
+ o->undirty();
+}
+
void librados::ObjectWriteOperation::tmap_put(const bufferlist &bl)
{
::ObjectOperation *o = (::ObjectOperation *)impl;
@@ -958,6 +972,8 @@ int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
op_flags |= CEPH_OSD_FLAG_BALANCE_READS;
if (flags & OPERATION_LOCALIZE_READS)
op_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+ if (flags & OPERATION_ORDER_READS_WRITES)
+ op_flags |= CEPH_OSD_FLAG_RWORDERED;
return io_ctx_impl->aio_operate_read(obj, (::ObjectOperation*)o->impl, c->pc,
op_flags, pbl);
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index 5ff6e61fbe0..05766587930 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -567,4 +567,14 @@ void CDentry::remove_client_lease(ClientLease *l, Locker *locker)
locker->eval_gather(&lock);
}
-
+void CDentry::_put()
+{
+ if (get_num_ref() <= (int)is_dirty() + 1) {
+ CDentry::linkage_t *dnl = get_projected_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (get_num_ref() == (int)is_dirty() + !!in->get_num_ref())
+ in->mdcache->maybe_eval_stray(in, true);
+ }
+ }
+}
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 0d2445a525f..e40854adfaa 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -76,6 +76,8 @@ public:
static const int STATE_FRAGMENTING = (1<<1);
static const int STATE_PURGING = (1<<2);
static const int STATE_BADREMOTEINO = (1<<3);
+ // stray dentry needs notification of releasing reference
+ static const int STATE_STRAY = STATE_NOTIFYREF;
// -- pins --
static const int PIN_INODEPIN = 1; // linked inode is pinned
@@ -146,6 +148,7 @@ protected:
public:
elist<CDentry*>::item item_dirty;
+ elist<CDentry*>::item item_stray;
protected:
int auth_pins, nested_auth_pins;
@@ -254,6 +257,7 @@ public:
void last_put() {
lru_unpin();
}
+ void _put();
// auth pins
bool can_auth_pin();
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 2b991d78fde..4a5e636d9a6 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -27,7 +27,7 @@
#include "MDLog.h"
#include "LogSegment.h"
-#include "include/bloom_filter.hpp"
+#include "common/bloom_filter.hpp"
#include "include/Context.h"
#include "common/Clock.h"
@@ -655,6 +655,14 @@ void CDir::remove_null_dentries() {
assert(get_num_any() == items.size());
}
+void CDir::touch_dentries_bottom() {
+ dout(12) << "touch_dentries_bottom " << *this << dendl;
+
+ for (CDir::map_t::iterator p = items.begin();
+ p != items.end();
+ ++p)
+ inode->mdcache->touch_dentry_bottom(p->second);
+}
bool CDir::try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps)
{
@@ -1461,6 +1469,7 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
}
bool purged_any = false;
+ bool stray = inode->is_stray();
//int num_new_inodes_loaded = 0;
loff_t baseoff = p.get_off();
@@ -1605,6 +1614,12 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
if (in->inode.is_dirty_rstat())
in->mark_dirty_rstat();
+ if (stray) {
+ dn->state_set(CDentry::STATE_STRAY);
+ if (in->inode.nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
+ }
+
//in->hack_accessed = false;
//in->hack_load_stamp = ceph_clock_now(g_ceph_context);
//num_new_inodes_loaded++;
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 7cf2b6a43d7..86da4e5dfd3 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -357,6 +357,7 @@ private:
void remove_null_dentries();
void purge_stale_snap_data(const set<snapid_t>& snaps);
public:
+ void touch_dentries_bottom();
bool try_trim_snap_dentry(CDentry *dn, const set<snapid_t>& snaps);
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 46f8d33cfd8..7accc5a4dba 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -682,6 +682,12 @@ void CInode::last_put()
parent->put(CDentry::PIN_INODEPIN);
}
+void CInode::_put()
+{
+ if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
+ mdcache->maybe_eval_stray(this, true);
+}
+
void CInode::add_remote_parent(CDentry *p)
{
if (remote_parents.empty())
@@ -1073,7 +1079,6 @@ void CInode::_stored_backtrace(version_t v, Context *fin)
clear_dirty_parent();
if (fin)
fin->complete(0);
- mdcache->maybe_eval_stray(this);
}
void CInode::_mark_dirty_parent(LogSegment *ls, bool dirty_pool)
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 8e760220c14..1c2a9339c1c 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -156,6 +156,8 @@ public:
static const int STATE_STRAYPINNED = (1<<16);
static const int STATE_FROZENAUTHPIN = (1<<17);
static const int STATE_DIRTYPOOL = (1<<18);
+ // orphan inode needs notification of releasing reference
+ static const int STATE_ORPHAN = STATE_NOTIFYREF;
static const int MASK_STATE_EXPORTED =
(STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
@@ -812,6 +814,7 @@ public:
}
void first_get();
void last_put();
+ void _put();
// -- hierarchy stuff --
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 99bd761e0f7..19c9176f414 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1640,9 +1640,6 @@ void Locker::file_update_finish(CInode *in, Mutation *mut, bool share, client_t
share_inode_max_size(in);
}
issue_caps_set(need_issue);
-
- // unlinked stray? may need to purge (e.g., after all caps are released)
- mdcache->maybe_eval_stray(in);
}
Capability* Locker::issue_new_caps(CInode *in,
@@ -3011,8 +3008,6 @@ void Locker::remove_client_cap(CInode *in, client_t client)
}
try_eval(in, CEPH_CAP_LOCKS);
-
- mds->mdcache->maybe_eval_stray(in);
}
diff --git a/src/mds/LogEvent.cc b/src/mds/LogEvent.cc
index b775b6d9501..16e7f803196 100644
--- a/src/mds/LogEvent.cc
+++ b/src/mds/LogEvent.cc
@@ -46,10 +46,16 @@ LogEvent *LogEvent::decode(bufferlist& bl)
::decode(type, p);
if (EVENT_NEW_ENCODING == type) {
- DECODE_START(1, p);
- ::decode(type, p);
- event = decode_event(bl, p, type);
- DECODE_FINISH(p);
+ try {
+ DECODE_START(1, p);
+ ::decode(type, p);
+ event = decode_event(bl, p, type);
+ DECODE_FINISH(p);
+ }
+ catch (const buffer::error &e) {
+ generic_dout(0) << "failed to decode LogEvent (type maybe " << type << ")" << dendl;
+ return NULL;
+ }
} else { // we are using classic encoding
event = decode_event(bl, p, type);
}
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 86b380f2827..0188d418e0d 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -127,7 +127,8 @@ long g_num_caps = 0;
set<int> SimpleLock::empty_gather_set;
-MDCache::MDCache(MDS *m)
+MDCache::MDCache(MDS *m) :
+ delayed_eval_stray(member_offset(CDentry, item_stray))
{
mds = m;
migrator = new Migrator(mds, this);
@@ -631,7 +632,7 @@ void MDCache::populate_mydir()
CDir *dir = strays[i]->get_dirfrag(fg);
if (!dir)
dir = strays[i]->get_or_open_dirfrag(this, fg);
- if (!dir->is_complete()) {
+ if (dir->get_version() == 0) {
dir->fetch(new C_MDS_RetryOpenRoot(this));
return;
}
@@ -652,6 +653,8 @@ void MDCache::populate_mydir()
assert(!open);
open = true;
mds->queue_waiters(waiting_for_open);
+
+ scan_stray_dir();
}
void MDCache::open_foreign_mdsdir(inodeno_t ino, Context *fin)
@@ -676,6 +679,7 @@ CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
} else
assert(straydn->get_projected_linkage()->is_null());
+ straydn->state_set(CDentry::STATE_STRAY);
return straydn;
}
@@ -5934,8 +5938,9 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
mds->mdlog->start_entry(le);
- le->metablob.add_dir_context(in->get_parent_dir());
- le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true);
+ CDentry *dn = in->get_projected_parent_dn();
+ le->metablob.add_dir_context(dn->get_dir());
+ le->metablob.add_primary_dentry(dn, in, true);
le->metablob.add_truncate_finish(in->ino(), ls->offset);
journal_dirty_inode(mut, &le->metablob, in);
@@ -6017,8 +6022,15 @@ bool MDCache::trim(int max)
}
dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl;
- map<int, MCacheExpire*> expiremap;
+ // process delayed eval_stray()
+ for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
+ CDentry *dn = *p;
+ ++p;
+ dn->item_stray.remove_myself();
+ eval_stray(dn);
+ }
+ map<int, MCacheExpire*> expiremap;
bool is_standby_replay = mds->is_standby_replay();
int unexpirable = 0;
list<CDentry*> unexpirables;
@@ -6026,13 +6038,12 @@ bool MDCache::trim(int max)
while (lru.lru_get_size() + unexpirable > (unsigned)max) {
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) break;
- if (is_standby_replay && dn->get_linkage() &&
- dn->get_linkage()->inode->item_open_file.is_on_list()) {
+ if ((is_standby_replay && dn->get_linkage() &&
+ dn->get_linkage()->inode->item_open_file.is_on_list()) ||
+ trim_dentry(dn, expiremap)) {
unexpirables.push_back(dn);
++unexpirable;
- continue;
}
- trim_dentry(dn, expiremap);
}
for(list<CDentry*>::iterator i = unexpirables.begin();
i != unexpirables.end();
@@ -6087,7 +6098,7 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
}
-void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
+bool MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
{
dout(12) << "trim_dentry " << *dn << dendl;
@@ -6142,6 +6153,9 @@ void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
CInode *in = dnl->get_inode();
assert(in);
trim_inode(dn, in, con, expiremap);
+ // purging stray instead of trimming ?
+ if (dn->get_num_ref() > 0)
+ return true;
}
else {
assert(dnl->is_null());
@@ -6160,6 +6174,7 @@ void MDCache::trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap)
migrator->export_empty_import(dir);
if (mds->logger) mds->logger->inc(l_mds_iex);
+ return false;
}
@@ -6222,7 +6237,14 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map<int, MCacheExpi
trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p
// INODE
- if (!in->is_auth()) {
+ if (in->is_auth()) {
+ // eval stray after closing dirfrags
+ if (dn) {
+ maybe_eval_stray(in);
+ if (dn->get_num_ref() > 0)
+ return;
+ }
+ } else {
pair<int,int> auth = in->authority();
dirfrag_t df;
@@ -6305,6 +6327,12 @@ void MDCache::trim_non_auth()
// add back into lru (at the top)
lru.lru_insert_top(dn);
+ if (dn->get_dir()->get_inode()->is_stray()) {
+ dn->state_set(CDentry::STATE_STRAY);
+ if (dnl->is_primary() && dnl->get_inode()->inode.nlink == 0)
+ dnl->get_inode()->state_set(CInode::STATE_ORPHAN);
+ }
+
if (!first_auth) {
first_auth = dn;
} else {
@@ -6725,9 +6753,6 @@ void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gath
if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
-
- // trim?
- maybe_eval_stray(in);
}
void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
@@ -6737,10 +6762,6 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& ga
// fix lock
if (dn->lock.remove_replica(from))
gather_locks.insert(&dn->lock);
-
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- if (dnl->is_primary())
- maybe_eval_stray(dnl->get_inode());
}
void MDCache::trim_client_leases()
@@ -9116,19 +9137,34 @@ void MDCache::_snaprealm_create_finish(MDRequest *mdr, Mutation *mut, CInode *in
// -------------------------------------------------------------------------------
// STRAYS
-void MDCache::scan_stray_dir()
+struct C_MDC_RetryScanStray : public Context {
+ MDCache *cache;
+ dirfrag_t next;
+ C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : cache(c), next(n) { }
+ void finish(int r) {
+ cache->scan_stray_dir(next);
+ }
+};
+
+void MDCache::scan_stray_dir(dirfrag_t next)
{
- dout(10) << "scan_stray_dir" << dendl;
-
+ dout(10) << "scan_stray_dir " << next << dendl;
+
list<CDir*> ls;
for (int i = 0; i < NUM_STRAY; ++i) {
- if (strays[i]) {
- strays[i]->get_dirfrags(ls);
- }
+ if (strays[i]->ino() < next.ino)
+ continue;
+ strays[i]->get_dirfrags(ls);
}
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
+ if (dir->dirfrag() < next)
+ continue;
+ if (!dir->is_complete()) {
+ dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
+ return;
+ }
for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
CDentry *dn = q->second;
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9147,7 +9183,7 @@ struct C_MDC_EvalStray : public Context {
}
};
-void MDCache::eval_stray(CDentry *dn)
+void MDCache::eval_stray(CDentry *dn, bool delay)
{
dout(10) << "eval_stray " << *dn << dendl;
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9211,9 +9247,13 @@ void MDCache::eval_stray(CDentry *dn)
dout(20) << " too many dn refs" << dendl;
return;
}
- purge_stray(dn);
+ if (delay) {
+ if (!dn->item_stray.is_on_list())
+ delayed_eval_stray.push_back(&dn->item_stray);
+ } else
+ purge_stray(dn);
}
- else if (in->inode.nlink == 1) {
+ else if (in->inode.nlink >= 1) {
// trivial reintegrate?
if (!in->remote_parents.empty()) {
CDentry *rlink = *in->remote_parents.begin();
@@ -9257,14 +9297,6 @@ void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Conte
mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
}
-void MDCache::remove_backtrace(inodeno_t ino, int64_t pool, Context *fin)
-{
- SnapContext snapc;
- object_t oid = CInode::get_object_name(ino, frag_t(), "");
- mds->objecter->removexattr(oid, object_locator_t(pool), "parent", snapc,
- ceph_clock_now(g_ceph_context), 0, NULL, fin);
-}
-
class C_MDC_PurgeStrayPurged : public Context {
MDCache *cache;
CDentry *dn;
@@ -9276,94 +9308,6 @@ public:
}
};
-class C_MDC_PurgeForwardingPointers : public Context {
- MDCache *cache;
- CDentry *dn;
-public:
- bufferlist bl;
- C_MDC_PurgeForwardingPointers(MDCache *c, CDentry *d) :
- cache(c), dn(d) {}
- void finish(int r) {
- cache->_purge_forwarding_pointers(bl, dn, r);
- }
-};
-
-class C_MDC_PurgeStray : public Context {
- MDCache *cache;
- CDentry *dn;
-public:
- C_MDC_PurgeStray(MDCache *c, CDentry *d) :
- cache(c), dn(d) {}
- void finish(int r) {
- cache->_purge_stray(dn, r);
- }
-};
-
-void MDCache::_purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r)
-{
- assert(r == 0 || r == -ENOENT || r == -ENODATA);
- inode_backtrace_t backtrace;
- if (r == 0)
- ::decode(backtrace, bl);
-
- // setup gathering context
- C_GatherBuilder gather_bld(g_ceph_context);
-
- // remove all the objects with forwarding pointer backtraces (aka sentinels)
- for (set<int64_t>::const_iterator i = backtrace.old_pools.begin();
- i != backtrace.old_pools.end();
- ++i) {
- SnapContext snapc;
- object_t oid = CInode::get_object_name(backtrace.ino, frag_t(), "");
- object_locator_t oloc(*i);
-
- mds->objecter->remove(oid, oloc, snapc, ceph_clock_now(g_ceph_context), 0,
- NULL, gather_bld.new_sub());
- }
-
- if (gather_bld.has_subs()) {
- gather_bld.set_finisher(new C_MDC_PurgeStray(this, dn));
- gather_bld.activate();
- } else {
- _purge_stray(dn, r);
- }
-}
-
-void MDCache::_purge_stray(CDentry *dn, int r)
-{
- // purge the strays
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- CInode *in = dnl->get_inode();
- dout(10) << "_purge_stray " << *dn << " " << *in << dendl;
-
- SnapRealm *realm = in->find_snaprealm();
- SnapContext nullsnap;
- const SnapContext *snapc;
- if (realm) {
- dout(10) << " realm " << *realm << dendl;
- snapc = &realm->get_snap_context();
- } else {
- dout(10) << " NO realm, using null context" << dendl;
- snapc = &nullsnap;
- assert(in->last == CEPH_NOSNAP);
- }
-
- uint64_t period = (uint64_t)in->inode.layout.fl_object_size * (uint64_t)in->inode.layout.fl_stripe_count;
- uint64_t cur_max_size = in->inode.get_max_size();
- uint64_t to = MAX(in->inode.size, cur_max_size);
- if (to && period) {
- uint64_t num = (to + period - 1) / period;
- dout(10) << "purge_stray 0~" << to << " objects 0~" << num << " snapc " << snapc << " on " << *in << dendl;
- mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
- 0, num, ceph_clock_now(g_ceph_context), 0,
- new C_MDC_PurgeStrayPurged(this, dn));
-
- } else {
- dout(10) << "purge_stray 0 objects snapc " << snapc << " on " << *in << dendl;
- _purge_stray_purged(dn);
- }
-}
-
void MDCache::purge_stray(CDentry *dn)
{
CDentry::linkage_t *dnl = dn->get_projected_linkage();
@@ -9381,24 +9325,90 @@ void MDCache::purge_stray(CDentry *dn)
dn->get(CDentry::PIN_PURGING);
in->state_set(CInode::STATE_PURGING);
-
+ if (dn->item_stray.is_on_list())
+ dn->item_stray.remove_myself();
+
+ if (in->is_dirty_parent())
+ in->clear_dirty_parent();
+
// CHEAT. there's no real need to journal our intent to purge, since
// that is implicit in the dentry's presence and non-use in the stray
// dir. on recovery, we'll need to re-eval all strays anyway.
+ SnapContext nullsnapc;
+ C_GatherBuilder gather(g_ceph_context, new C_MDC_PurgeStrayPurged(this, dn));
+
if (in->is_dir()) {
- dout(10) << "purge_stray dir ... implement me!" << dendl; // FIXME XXX
- // remove the backtrace
- remove_backtrace(in->ino(), mds->mdsmap->get_metadata_pool(),
- new C_MDC_PurgeStrayPurged(this, dn));
- } else if (in->is_file()) {
- // get the backtrace before blowing away the object
- C_MDC_PurgeForwardingPointers *fin = new C_MDC_PurgeForwardingPointers(this, dn);
- fetch_backtrace(in->ino(), in->get_inode().layout.fl_pg_pool, fin->bl, fin);
+ object_locator_t oloc(mds->mdsmap->get_metadata_pool());
+ list<frag_t> ls;
+ if (!in->dirfragtree.is_leaf(frag_t()))
+ in->dirfragtree.get_leaves(ls);
+ ls.push_back(frag_t());
+ for (list<frag_t>::iterator p = ls.begin();
+ p != ls.end();
+ ++p) {
+ object_t oid = CInode::get_object_name(in->inode.ino, *p, "");
+ dout(10) << "purge_stray remove dirfrag " << oid << dendl;
+ mds->objecter->remove(oid, oloc, nullsnapc, ceph_clock_now(g_ceph_context),
+ 0, NULL, gather.new_sub());
+ }
+ assert(gather.has_subs());
+ gather.activate();
+ return;
+ }
+
+ const SnapContext *snapc;
+ SnapRealm *realm = in->find_snaprealm();
+ if (realm) {
+ dout(10) << " realm " << *realm << dendl;
+ snapc = &realm->get_snap_context();
} else {
- // not a dir or file; purged!
- _purge_stray_purged(dn);
+ dout(10) << " NO realm, using null context" << dendl;
+ snapc = &nullsnapc;
+ assert(in->last == CEPH_NOSNAP);
}
+
+ if (in->is_file()) {
+ uint64_t period = (uint64_t)in->inode.layout.fl_object_size *
+ (uint64_t)in->inode.layout.fl_stripe_count;
+ uint64_t to = in->inode.get_max_size();
+ to = MAX(in->inode.size, to);
+ // when truncating a file, the filer does not delete stripe objects that are
+ // truncated to zero. so we need to purge stripe objects up to the max size
+ // the file has ever been.
+ to = MAX(in->inode.max_size_ever, to);
+ if (to && period) {
+ uint64_t num = (to + period - 1) / period;
+ dout(10) << "purge_stray 0~" << to << " objects 0~" << num
+ << " snapc " << snapc << " on " << *in << dendl;
+ mds->filer->purge_range(in->inode.ino, &in->inode.layout, *snapc,
+ 0, num, ceph_clock_now(g_ceph_context), 0,
+ gather.new_sub());
+ }
+ }
+
+ inode_t *pi = in->get_projected_inode();
+ object_t oid = CInode::get_object_name(pi->ino, frag_t(), "");
+ // remove the backtrace object if it was not purged
+ if (!gather.has_subs()) {
+ object_locator_t oloc(pi->layout.fl_pg_pool);
+ dout(10) << "purge_stray remove backtrace object " << oid
+ << " pool " << oloc.pool << " snapc " << snapc << dendl;
+ mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+ NULL, gather.new_sub());
+ }
+ // remove old backtrace objects
+ for (vector<int64_t>::iterator p = pi->old_pools.begin();
+ p != pi->old_pools.end();
+ ++p) {
+ object_locator_t oloc(*p);
+ dout(10) << "purge_stray remove backtrace object " << oid
+ << " old pool " << *p << " snapc " << snapc << dendl;
+ mds->objecter->remove(oid, oloc, *snapc, ceph_clock_now(g_ceph_context), 0,
+ NULL, gather.new_sub());
+ }
+ assert(gather.has_subs());
+ gather.activate();
}
class C_MDC_PurgeStrayLogged : public Context {
@@ -9480,9 +9490,6 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
CInode *in = dn->get_linkage()->get_inode();
dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
- dn->state_clear(CDentry::STATE_PURGING);
- dn->put(CDentry::PIN_PURGING);
-
assert(!in->state_test(CInode::STATE_RECOVERING));
// unlink
@@ -9493,11 +9500,13 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
dn->dir->pop_and_dirty_projected_fnode(ls);
+ in->state_clear(CInode::STATE_ORPHAN);
+ dn->state_clear(CDentry::STATE_PURGING);
+ dn->put(CDentry::PIN_PURGING);
+
// drop inode
if (in->is_dirty())
in->mark_clean();
- if (in->is_dirty_parent())
- in->clear_dirty_parent();
remove_inode(in);
@@ -10639,7 +10648,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m)
!in->state_test(CInode::STATE_EXPORTINGCAPS))
migrator->export_caps(in);
- lru.lru_bottouch(straydn); // move stray to end of lru
+ touch_dentry_bottom(straydn); // move stray to end of lru
straydn = NULL;
} else {
assert(!straydn);
@@ -10649,7 +10658,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m)
assert(dnl->is_null());
// move to bottom of lru
- lru.lru_bottouch(dn);
+ touch_dentry_bottom(dn);
}
}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index b4b57da84b2..416c6454292 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -19,6 +19,7 @@
#include "include/types.h"
#include "include/filepath.h"
+#include "include/elist.h"
#include "CInode.h"
#include "CDentry.h"
@@ -564,7 +565,7 @@ public:
// trimming
bool trim(int max = -1); // trim cache
- void trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap);
+ bool trim_dentry(CDentry *dn, map<int, MCacheExpire*>& expiremap);
void trim_dirfrag(CDir *dir, CDir *con,
map<int, MCacheExpire*>& expiremap);
void trim_inode(CDentry *dn, CInode *in, CDir *con,
@@ -646,6 +647,15 @@ public:
}
void touch_dentry_bottom(CDentry *dn) {
lru.lru_bottouch(dn);
+ if (dn->get_projected_linkage()->is_primary()) {
+ CInode *in = dn->get_projected_linkage()->get_inode();
+ if (in->has_dirfrags()) {
+ list<CDir*> ls;
+ in->get_dirfrags(ls);
+ for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
+ (*p)->touch_dentries_bottom();
+ }
+ }
}
protected:
@@ -858,31 +868,29 @@ public:
// -- stray --
public:
- void scan_stray_dir();
- void eval_stray(CDentry *dn);
+ elist<CDentry*> delayed_eval_stray;
+
+ void eval_stray(CDentry *dn, bool delay=false);
void eval_remote(CDentry *dn);
- void maybe_eval_stray(CInode *in) {
+ void maybe_eval_stray(CInode *in, bool delay=false) {
if (in->inode.nlink > 0 || in->is_base())
return;
CDentry *dn = in->get_projected_parent_dn();
- if (dn->get_projected_linkage()->is_primary() &&
- dn->get_dir()->get_inode()->is_stray() &&
- !dn->is_replicated())
- eval_stray(dn);
+ if (!dn->state_test(CDentry::STATE_PURGING) &&
+ dn->get_projected_linkage()->is_primary() &&
+ dn->get_dir()->get_inode()->is_stray())
+ eval_stray(dn, delay);
}
protected:
+ void scan_stray_dir(dirfrag_t next=dirfrag_t());
void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
- void remove_backtrace(inodeno_t ino, int64_t pool, Context *fin);
- void _purge_forwarding_pointers(bufferlist& bl, CDentry *dn, int r);
- void _purge_stray(CDentry *dn, int r);
void purge_stray(CDentry *dn);
void _purge_stray_purged(CDentry *dn, int r=0);
void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls);
void _purge_stray_logged_truncate(CDentry *dn, LogSegment *ls);
+ friend class C_MDC_RetryScanStray;
friend class C_MDC_FetchedBacktrace;
- friend class C_MDC_PurgeForwardingPointers;
- friend class C_MDC_PurgeStray;
friend class C_MDC_PurgeStrayLogged;
friend class C_MDC_PurgeStrayLoggedTruncate;
friend class C_MDC_PurgeStrayPurged;
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 1ace72e0ac3..cacbebfd3f6 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -499,7 +499,11 @@ void MDLog::_replay_thread()
if (journaler->get_error()) {
r = journaler->get_error();
dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
- if (r == -EINVAL) {
+ if (r == -ENOENT) {
+ // journal has been trimmed by somebody else?
+ assert(journaler->is_readonly());
+ r = -EAGAIN;
+ } else if (r == -EINVAL) {
if (journaler->get_read_pos() < journaler->get_expire_pos()) {
// this should only happen if you're following somebody else
assert(journaler->is_readonly());
@@ -605,7 +609,7 @@ void MDLog::_replay_thread()
}
dout(10) << "_replay_thread kicking waiters" << dendl;
- finish_contexts(g_ceph_context, waitfor_replay, 0);
+ finish_contexts(g_ceph_context, waitfor_replay, r);
dout(10) << "_replay_thread finish" << dendl;
mds->mds_lock.Unlock();
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index c2e0bbbe369..83722274981 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1525,7 +1525,6 @@ void MDS::active_start()
mdcache->open_root();
mdcache->clean_open_file_lists();
- mdcache->scan_stray_dir();
mdcache->export_remaining_imported_caps();
finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters
finish_contexts(g_ceph_context, waiting_for_active); // kick waiters
diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc
index 1646a134ad5..f1ab9b112d8 100644
--- a/src/mds/MDSMap.cc
+++ b/src/mds/MDSMap.cc
@@ -470,7 +470,7 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(cas_pool, bl);
// kclient ignores everything from here
- __u16 ev = 5;
+ __u16 ev = 6;
::encode(ev, bl);
::encode(compat, bl);
::encode(metadata_pool, bl);
@@ -483,6 +483,8 @@ void MDSMap::encode(bufferlist& bl, uint64_t features) const
::encode(failed, bl);
::encode(stopped, bl);
::encode(last_failure_osd_epoch, bl);
+ ::encode(ever_allowed_snaps, bl);
+ ::encode(explicitly_allowed_snaps, bl);
ENCODE_FINISH(bl);
}
}
@@ -540,5 +542,12 @@ void MDSMap::decode(bufferlist::iterator& p)
::decode(stopped, p);
if (ev >= 4)
::decode(last_failure_osd_epoch, p);
+ if (ev >= 6) {
+ ::decode(ever_allowed_snaps, p);
+ ::decode(explicitly_allowed_snaps, p);
+ } else {
+ ever_allowed_snaps = true;
+ explicitly_allowed_snaps = false;
+ }
DECODE_FINISH(p);
}
diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h
index 5bfc7cc20d5..5eadf156a95 100644
--- a/src/mds/MDSMap.h
+++ b/src/mds/MDSMap.h
@@ -175,6 +175,9 @@ protected:
map<int32_t,uint64_t> up; // who is in those roles
map<uint64_t,mds_info_t> mds_info;
+ bool ever_allowed_snaps; //< the cluster has ever allowed snap creation
+ bool explicitly_allowed_snaps; //< the user has explicitly enabled snap creation
+
public:
CompatSet compat;
@@ -188,7 +191,9 @@ public:
max_file_size(0),
cas_pool(-1),
metadata_pool(0),
- max_mds(0)
+ max_mds(0),
+ ever_allowed_snaps(false),
+ explicitly_allowed_snaps(false)
{ }
utime_t get_session_timeout() {
@@ -201,6 +206,14 @@ public:
void set_flag(int f) { flags |= f; }
void clear_flag(int f) { flags &= ~f; }
+ void set_snaps_allowed() {
+ set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
+ ever_allowed_snaps = true;
+ explicitly_allowed_snaps = true;
+ }
+ bool allows_snaps() { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+ void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
+
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 466d4818456..41862847e27 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1167,10 +1167,11 @@ void Server::dispatch_client_request(MDRequest *mdr)
// inodes ops.
case CEPH_MDS_OP_LOOKUP:
- case CEPH_MDS_OP_LOOKUPSNAP:
handle_client_getattr(mdr, true);
break;
+ case CEPH_MDS_OP_LOOKUPSNAP:
+ // lookupsnap does not reference a CDentry; treat it as a getattr
case CEPH_MDS_OP_GETATTR:
handle_client_getattr(mdr, false);
break;
@@ -3085,6 +3086,7 @@ void Server::handle_client_file_readlock(MDRequest *mdr)
checking_lock.length = req->head.args.filelock_change.length;
checking_lock.client = req->get_orig_source().num();
checking_lock.pid = req->head.args.filelock_change.pid;
+ checking_lock.pid_namespace = req->head.args.filelock_change.pid_namespace;
checking_lock.type = req->head.args.filelock_change.type;
// get the appropriate lock state
@@ -4909,8 +4911,10 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
inode_t *pi = in->project_inode();
mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
pi->version = in->pre_dirty();
- pi->nlink--;
pi->ctime = mdr->now;
+ pi->nlink--;
+ if (pi->nlink == 0)
+ in->state_set(CInode::STATE_ORPHAN);
if (dnl->is_primary()) {
// primary link. add stray dentry.
@@ -6054,8 +6058,10 @@ void Server::_rename_prepare(MDRequest *mdr,
pi->nlink--;
}
if (tpi) {
- tpi->nlink--;
tpi->ctime = mdr->now;
+ tpi->nlink--;
+ if (tpi->nlink == 0)
+ oldin->state_set(CInode::STATE_ORPHAN);
}
}
@@ -7157,6 +7163,12 @@ struct C_MDS_mksnap_finish : public Context {
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mksnap(MDRequest *mdr)
{
+ if (!mds->mdsmap->allows_snaps()) {
+ // you can't make snapshots until you set an option right now
+ reply_request(mdr, -EPERM);
+ return;
+ }
+
MClientRequest *req = mdr->client_request;
CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
if (!diri || diri->state_test(CInode::STATE_PURGING)) {
diff --git a/src/mds/flock.h b/src/mds/flock.h
index ae93d1660f0..b767fe58507 100644
--- a/src/mds/flock.h
+++ b/src/mds/flock.h
@@ -12,7 +12,7 @@
inline ostream& operator<<(ostream& out, ceph_filelock& l) {
out << "start: " << l.start << ", length: " << l.length
<< ", client: " << l.client << ", pid: " << l.pid
- << ", type: " << (int)l.type
+ << ", pid_ns: " << l.pid_namespace << ", type: " << (int)l.type
<< std::endl;
return out;
}
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index 6886786f27e..362f74774c4 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -204,7 +204,7 @@ ostream& operator<<(ostream& out, const client_writeable_range_t& r)
*/
void inode_t::encode(bufferlist &bl) const
{
- ENCODE_START(7, 6, bl);
+ ENCODE_START(8, 6, bl);
::encode(ino, bl);
::encode(rdev, bl);
@@ -238,6 +238,7 @@ void inode_t::encode(bufferlist &bl) const
::encode(xattr_version, bl);
::encode(backtrace_version, bl);
::encode(old_pools, bl);
+ ::encode(max_size_ever, bl);
ENCODE_FINISH(bl);
}
@@ -294,6 +295,8 @@ void inode_t::decode(bufferlist::iterator &p)
::decode(backtrace_version, p);
if (struct_v >= 7)
::decode(old_pools, p);
+ if (struct_v >= 8)
+ ::decode(max_size_ever, p);
DECODE_FINISH(p);
}
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 902e3104aa8..bd53c85b48d 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -329,6 +329,7 @@ struct inode_t {
ceph_file_layout layout;
vector <int64_t> old_pools;
uint64_t size; // on directory, # dentries
+ uint64_t max_size_ever; // max size the file has ever been
uint32_t truncate_seq;
uint64_t truncate_size, truncate_from;
uint32_t truncate_pending;
@@ -353,7 +354,8 @@ struct inode_t {
inode_t() : ino(0), rdev(0),
mode(0), uid(0), gid(0),
nlink(0), anchored(false),
- size(0), truncate_seq(0), truncate_size(0), truncate_from(0),
+ size(0), max_size_ever(0),
+ truncate_seq(0), truncate_size(0), truncate_from(0),
truncate_pending(0),
time_warp_seq(0),
version(0), file_data_version(0), xattr_version(0), backtrace_version(0) {
@@ -369,6 +371,8 @@ struct inode_t {
bool is_truncating() const { return (truncate_pending > 0); }
void truncate(uint64_t old_size, uint64_t new_size) {
assert(new_size < old_size);
+ if (old_size > max_size_ever)
+ max_size_ever = old_size;
truncate_from = old_size;
size = new_size;
rstat.rbytes = new_size;
@@ -1134,8 +1138,9 @@ class MDSCacheObject {
// -- state --
const static int STATE_AUTH = (1<<30);
const static int STATE_DIRTY = (1<<29);
- const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy
- const static int STATE_REJOINUNDEF = (1<<27); // contents undefined.
+ const static int STATE_NOTIFYREF = (1<<28); // notify via _put() when a ref is dropped
+ const static int STATE_REJOINING = (1<<27); // replica has not joined w/ primary copy
+ const static int STATE_REJOINUNDEF = (1<<26); // contents undefined.
// -- wait --
@@ -1221,6 +1226,7 @@ protected:
#endif
assert(ref > 0);
}
+ virtual void _put() {}
void put(int by) {
#ifdef MDS_REF_SET
if (ref == 0 || ref_map[by] == 0) {
@@ -1236,6 +1242,8 @@ protected:
#endif
if (ref == 0)
last_put();
+ if (state_test(STATE_NOTIFYREF))
+ _put();
}
}
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index b2273274521..48c1c99d584 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -920,6 +920,36 @@ bool MDSMonitor::prepare_command(MMonCommand *m)
r = 0;
}
+ } else if (prefix == "mds set") {
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ if (key == "allow_new_snaps") {
+ if (sure != "--yes-i-really-mean-it") {
+ ss << "Snapshots are unstable and will probably break your FS! Add --yes-i-really-mean-it if you are sure";
+ r = -EPERM;
+ } else {
+ pending_mdsmap.set_snaps_allowed();
+ ss << "turned on snaps";
+ r = 0;
+ }
+ }
+ } else if (prefix == "mds unset") {
+ string key;
+ cmd_getval(g_ceph_context, cmdmap, "key", key);
+ string sure;
+ cmd_getval(g_ceph_context, cmdmap, "sure", sure);
+ if (key == "allow_new_snaps") {
+ if (sure != "--yes-i-really-mean-it") {
+ ss << "this won't get rid of snapshots or restore the cluster if it's broken. Add --yes-i-really-mean-it if you are sure";
+ r = -EPERM;
+ } else {
+ pending_mdsmap.clear_snaps_allowed();
+ ss << "disabled new snapshots";
+ r = 0;
+ }
+ }
} else if (prefix == "mds add_data_pool") {
int64_t poolid;
cmd_getval(g_ceph_context, cmdmap, "poolid", poolid);
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 365fd28b64e..33e00a98d30 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -59,7 +59,7 @@
* CephString: optional badchars
* CephSocketpath: validation involves "is it S_ISSOCK"
* CephIPAddr: v4 or v6 addr with optional port, syntax validated
- * CephEntityAddr: CephIPAddr + '/nonce'
+ * CephEntityAddr: CephIPAddr + optional '/nonce'
* CephPoolname: Plainold string
* CephObjectname: Another plainold string
* CephPgid: n.xxx where n is an int > 0, xxx is a hex number > 0
@@ -210,8 +210,8 @@ COMMAND("quorum_status", "report status of monitor quorum", \
"mon", "r", "cli,rest")
COMMAND("mon_status", "report status of monitors", "mon", "r", "cli,rest")
COMMAND("sync force " \
- "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it " \
- "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing", \
+ "name=validate1,type=CephChoices,strings=--yes-i-really-mean-it,req=false " \
+ "name=validate2,type=CephChoices,strings=--i-know-what-i-am-doing,req=false", \
"force sync of and clear monitor store", "mon", "rw", "cli,rest")
COMMAND("heap " \
"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
@@ -274,6 +274,15 @@ COMMAND("mds compat rm_compat " \
COMMAND("mds compat rm_incompat " \
"name=feature,type=CephInt,range=0", \
"remove incompatible feature", "mds", "rw", "cli,rest")
+COMMAND("mds set " \
+ "name=key,type=CephChoices,strings=allow_new_snaps " \
+ "name=sure,type=CephString,req=false", \
+ "set <key>", \
+ "mds", "w", "cli,rest")
+COMMAND("mds unset " \
+ "name=key,type=CephChoices,strings=allow_new_snaps " \
+ "name=sure,type=CephString,req=false", \
+ "unset <key>", "mds", "w", "cli,rest")
COMMAND("mds add_data_pool " \
"name=poolid,type=CephInt,range=0", \
"add data pool <poolid>", "mds", "rw", "cli,rest")
@@ -283,14 +292,14 @@ COMMAND("mds remove_data_pool " \
COMMAND("mds newfs " \
"name=metadata,type=CephInt,range=0 " \
"name=data,type=CephInt,range=0 " \
- "name=sure,type=CephChoices,strings=--yes-i-really-mean-it", \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"make new filesystom using pools <metadata> and <data>", \
"mds", "rw", "cli,rest")
/*
* Monmap commands
*/
COMMAND("mon dump " \
- "name=epoch,type=CephInt,req=false", \
+ "name=epoch,type=CephInt,range=0,req=false", \
"dump formatted monmap (optionally from epoch)", \
"mon", "r", "cli,rest")
COMMAND("mon stat", "summarize monitor status", "mon", "r", "cli,rest")
@@ -456,7 +465,7 @@ COMMAND("osd reweight " \
"reweight osd to 0.0 < <weight> < 1.0", "osd", "rw", "cli,rest")
COMMAND("osd lost " \
"name=id,type=CephInt,range=0 " \
- "name=sure,type=CephChoices,strings=--yes-i-really-mean-it", \
+ "name=sure,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"mark osd as permanently lost. THIS DESTROYS DATA IF NO MORE REPLICAS EXIST, BE CAREFUL", \
"osd", "rw", "cli,rest")
COMMAND("osd create " \
@@ -484,9 +493,9 @@ COMMAND("osd pool create " \
"create pool", "osd", "rw", "cli,rest")
COMMAND("osd pool delete " \
"name=pool,type=CephPoolname " \
- "name=pool2,type=CephPoolname " \
- "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it", \
- "delete pool (say pool twice, add --yes-i-really-really-mean-it)", \
+ "name=pool2,type=CephPoolname,req=false " \
+ "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it,req=false", \
+ "delete pool", \
"osd", "rw", "cli,rest")
COMMAND("osd pool rename " \
"name=srcpool,type=CephPoolname " \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 10f5bfb149c..d8c90bc3d76 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -1854,13 +1854,7 @@ void Monitor::get_status(stringstream &ss, Formatter *f)
}
#undef COMMAND
-struct MonCommand {
- string cmdstring;
- string helpstring;
- string module;
- string req_perms;
- string availability;
-} mon_commands[] = {
+MonCommand mon_commands[] = {
#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
{parsesig, helptext, modulename, req_perms, avail},
#include <mon/MonCommands.h>
@@ -1909,6 +1903,26 @@ bool Monitor::_allowed_command(MonSession *s, string &module, string &prefix,
return capable;
}
+void get_command_descriptions(const MonCommand *commands,
+ unsigned commands_size,
+ Formatter *f,
+ bufferlist *rdata) {
+ int cmdnum = 0;
+ f->open_object_section("command_descriptions");
+ for (const MonCommand *cp = commands;
+ cp < &commands[commands_size]; cp++) {
+
+ ostringstream secname;
+ secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+ dump_cmddesc_to_json(f, secname.str(),
+ cp->cmdstring, cp->helpstring, cp->module,
+ cp->req_perms, cp->availability);
+ cmdnum++;
+ }
+ f->close_section(); // command_descriptions
+
+ f->flush(*rdata);
+}
void Monitor::handle_command(MMonCommand *m)
{
@@ -1953,23 +1967,9 @@ void Monitor::handle_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
if (prefix == "get_command_descriptions") {
- int cmdnum = 0;
- Formatter *f = new_formatter("json");
- f->open_object_section("command_descriptions");
- for (MonCommand *cp = mon_commands;
- cp < &mon_commands[ARRAY_SIZE(mon_commands)]; cp++) {
-
- ostringstream secname;
- secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
- dump_cmddesc_to_json(f, secname.str(),
- cp->cmdstring, cp->helpstring, cp->module,
- cp->req_perms, cp->availability);
- cmdnum++;
- }
- f->close_section(); // command_descriptions
-
bufferlist rdata;
- f->flush(rdata);
+ Formatter *f = new_formatter("json");
+ get_command_descriptions(mon_commands, ARRAY_SIZE(mon_commands), f, &rdata);
delete f;
reply_command(m, 0, "", rdata, 0);
return;
@@ -2561,67 +2561,98 @@ bool Monitor::_ms_dispatch(Message *m)
EntityName entity_name;
bool src_is_mon;
- src_is_mon = !connection || (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
-
- if (connection) {
- bool reuse_caps = false;
- dout(20) << "have connection" << dendl;
- s = static_cast<MonSession *>(connection->get_priv());
- if (s && s->closed) {
- caps = s->caps;
- reuse_caps = true;
- s->put();
- s = NULL;
+ // regardless of who we are or who the sender is, the message must
+ // have a connection associated. If it doesn't then something fishy
+ // is going on.
+ assert(connection);
+
+ src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+
+ bool reuse_caps = false;
+ dout(20) << "have connection" << dendl;
+ s = static_cast<MonSession *>(connection->get_priv());
+ if (s && s->closed) {
+ caps = s->caps;
+ reuse_caps = true;
+ s->put();
+ s = NULL;
+ }
+ if (!s) {
+ // if the sender is not a monitor, make sure their first message for a
+ // session is an MAuth. If it is not, assume it's a stray message,
+ // and considering that we are creating a new session it is safe to
+ // assume that the sender hasn't authenticated yet, so we have no way
+ // of assessing whether we should handle it or not.
+ if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+ m->get_type() != CEPH_MSG_MON_GET_MAP)) {
+ dout(1) << __func__ << " dropping stray message " << *m
+ << " from " << m->get_source_inst() << dendl;
+ return false;
}
- if (!s) {
- if (!exited_quorum.is_zero() && !src_is_mon) {
- waitlist_or_zap_client(m);
- return true;
- }
- dout(10) << "do not have session, making new one" << dendl;
- s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
- m->get_connection()->set_priv(s->get());
- dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
-
- if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
- dout(10) << "setting timeout on session" << dendl;
- // set an initial timeout here, so we will trim this session even if they don't
- // do anything.
- s->until = ceph_clock_now(g_ceph_context);
- s->until += g_conf->mon_subscribe_interval;
- } else {
- //give it monitor caps; the peer type has been authenticated
- reuse_caps = false;
- dout(5) << "setting monitor caps on this connection" << dendl;
- if (!s->caps.is_allow_all()) //but no need to repeatedly copy
- s->caps = *mon_caps;
- }
- if (reuse_caps)
- s->caps = caps;
+
+ if (!exited_quorum.is_zero() && !src_is_mon) {
+ waitlist_or_zap_client(m);
+ return true;
+ }
+
+ dout(10) << "do not have session, making new one" << dendl;
+ s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+ m->get_connection()->set_priv(s->get());
+ dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
+
+ if (!src_is_mon) {
+ dout(10) << "setting timeout on session" << dendl;
+ // set an initial timeout here, so we will trim this session even if they don't
+ // do anything.
+ s->until = ceph_clock_now(g_ceph_context);
+ s->until += g_conf->mon_subscribe_interval;
} else {
- dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ //give it monitor caps; the peer type has been authenticated
+ reuse_caps = false;
+ dout(5) << "setting monitor caps on this connection" << dendl;
+ if (!s->caps.is_allow_all()) //but no need to repeatedly copy
+ s->caps = *mon_caps;
}
+ if (reuse_caps)
+ s->caps = caps;
+ } else {
+ dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ }
+
+ if (s) {
if (s->auth_handler) {
entity_name = s->auth_handler->get_entity_name();
}
- }
-
- if (s)
dout(20) << " caps " << s->caps.get_str() << dendl;
+ }
if (is_synchronizing() && !src_is_mon) {
waitlist_or_zap_client(m);
return true;
}
- {
- switch (m->get_type()) {
-
+ ret = dispatch(s, m, src_is_mon);
+
+ if (s) {
+ s->put();
+ }
+
+ return ret;
+}
+
+bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
+{
+ bool ret = true;
+
+ assert(m != NULL);
+
+ switch (m->get_type()) {
+
case MSG_ROUTE:
handle_route(static_cast<MRoute*>(m));
break;
- // misc
+ // misc
case CEPH_MSG_MON_GET_MAP:
handle_mon_get_map(static_cast<MMonGetMap*>(m));
break;
@@ -2647,12 +2678,11 @@ bool Monitor::_ms_dispatch(Message *m)
case MSG_MON_SYNC:
handle_sync(static_cast<MMonSync*>(m));
break;
-
case MSG_MON_SCRUB:
handle_scrub(static_cast<MMonScrub*>(m));
break;
- // OSDs
+ // OSDs
case MSG_OSD_MARK_ME_DOWN:
case MSG_OSD_FAILURE:
case MSG_OSD_BOOT:
@@ -2665,20 +2695,20 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // MDSs
+ // MDSs
case MSG_MDS_BEACON:
case MSG_MDS_OFFLOAD_TARGETS:
paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // auth
+ // auth
case MSG_MON_GLOBAL_ID:
case CEPH_MSG_AUTH:
/* no need to check caps here */
paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m);
break;
- // pg
+ // pg
case CEPH_MSG_STATFS:
case MSG_PGSTATS:
case MSG_GETPOOLSTATS:
@@ -2689,7 +2719,7 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // log
+ // log
case MSG_LOG:
paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m);
break;
@@ -2698,60 +2728,60 @@ bool Monitor::_ms_dispatch(Message *m)
clog.handle_log_ack((MLogAck*)m);
break;
- // monmap
+ // monmap
case MSG_MON_JOIN:
paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // paxos
+ // paxos
case MSG_MON_PAXOS:
{
- MMonPaxos *pm = static_cast<MMonPaxos*>(m);
- if (!src_is_mon &&
- !s->is_capable("mon", MON_CAP_X)) {
- //can't send these!
- pm->put();
- break;
- }
+ MMonPaxos *pm = static_cast<MMonPaxos*>(m);
+ if (!src_is_mon ||
+ !s->is_capable("mon", MON_CAP_X)) {
+ //can't send these!
+ pm->put();
+ break;
+ }
- if (state == STATE_SYNCHRONIZING) {
- // we are synchronizing. These messages would do us no
- // good, thus just drop them and ignore them.
- dout(10) << __func__ << " ignore paxos msg from "
- << pm->get_source_inst() << dendl;
- pm->put();
- break;
- }
+ if (state == STATE_SYNCHRONIZING) {
+ // we are synchronizing. These messages would do us no
+ // good, thus just drop them and ignore them.
+ dout(10) << __func__ << " ignore paxos msg from "
+ << pm->get_source_inst() << dendl;
+ pm->put();
+ break;
+ }
- // sanitize
- if (pm->epoch > get_epoch()) {
- bootstrap();
- pm->put();
- break;
- }
- if (pm->epoch != get_epoch()) {
- pm->put();
- break;
- }
+ // sanitize
+ if (pm->epoch > get_epoch()) {
+ bootstrap();
+ pm->put();
+ break;
+ }
+ if (pm->epoch != get_epoch()) {
+ pm->put();
+ break;
+ }
- paxos->dispatch((PaxosServiceMessage*)m);
+ paxos->dispatch((PaxosServiceMessage*)m);
}
break;
- // elector messages
+ // elector messages
case MSG_MON_ELECTION:
//check privileges here for simplicity
if (s &&
- !s->is_capable("mon", MON_CAP_X)) {
- dout(0) << "MMonElection received from entity without enough caps!"
- << s->caps << dendl;
- m->put();
- break;
+ !s->is_capable("mon", MON_CAP_X)) {
+ dout(0) << "MMonElection received from entity without enough caps!"
+ << s->caps << dendl;
+ m->put();
+ break;
}
if (!is_probing() && !is_synchronizing()) {
- elector.dispatch(m);
+ elector.dispatch(m);
} else {
- m->put();
+ m->put();
}
break;
@@ -2769,10 +2799,6 @@ bool Monitor::_ms_dispatch(Message *m)
default:
ret = false;
- }
- }
- if (s) {
- s->put();
}
return ret;
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index df4a751361a..2c1c2cdeb19 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -700,6 +700,8 @@ public:
lock.Unlock();
return ret;
}
+ // dissociate message handling from session and connection logic
+ bool dispatch(MonSession *s, Message *m, const bool src_is_mon);
//mon_caps is used for un-connected messages from monitors
MonCap * mon_caps;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
@@ -844,5 +846,17 @@ public:
long parse_pos_long(const char *s, ostream *pss = NULL);
+struct MonCommand {
+ string cmdstring;
+ string helpstring;
+ string module;
+ string req_perms;
+ string availability;
+};
+
+void get_command_descriptions(const MonCommand *commands,
+ unsigned commands_size,
+ Formatter *f,
+ bufferlist *rdata);
#endif
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 799f19df154..ca855592445 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -298,20 +298,45 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
addr.set_port(CEPH_MON_PORT);
}
- if (pending_map.contains(addr) ||
- pending_map.contains(name)) {
+ /**
+ * If we have a monitor with the same name and different addr, then EEXIST
+ * If we have a monitor with the same addr and different name, then EEXIST
+ * If we have a monitor with the same addr and same name, then return as if
+ * we had just added the monitor.
+ * If we don't have the monitor, add it.
+ */
+
+ err = 0;
+ if (!ss.str().empty())
+ ss << "; ";
+
+ do {
+ if (pending_map.contains(addr)) {
+ string n = pending_map.get_name(addr);
+ if (n == name)
+ break;
+ } else if (pending_map.contains(name)) {
+ entity_addr_t tmp_addr = pending_map.get_addr(name);
+ if (tmp_addr == addr)
+ break;
+ } else {
+ break;
+ }
err = -EEXIST;
- if (!ss.str().empty())
- ss << "; ";
- ss << "mon " << name << " " << addr << " already exists";
+ ss << "mon." << name << " at " << addr << " already exists";
+ goto out;
+ } while (false);
+
+ ss << "added mon." << name << " at " << addr;
+ if (pending_map.contains(name)) {
goto out;
}
pending_map.add(name, addr);
pending_map.last_changed = ceph_clock_now(g_ceph_context);
- ss << "added mon." << name << " at " << addr;
getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+ get_last_committed()));
return true;
} else if (prefix == "mon remove") {
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index df614eedb92..9144736d801 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -120,7 +120,12 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
* We will possibly have a stashed latest that *we* wrote, and we will
* always be sure to have the oldest full map in the first..last range
* due to encode_trim_extra(), which includes the oldest full map in the trim
- * transaction. Start with whichever is newer.
+ * transaction.
+ *
+ * encode_trim_extra() does not however write the full map's
+ * version to 'full_latest'. This is only done when we are building the
+ * full maps from the incremental versions. But don't panic! We make sure
+ * that the following conditions find whichever full map version is newer.
*/
version_t latest_full = get_version_latest_full();
if (latest_full == 0 && get_first_committed() > 1)
@@ -179,32 +184,49 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
}
// walk through incrementals
- MonitorDBStore::Transaction t;
+ MonitorDBStore::Transaction *t = NULL;
+ size_t tx_size = 0;
while (version > osdmap.epoch) {
bufferlist inc_bl;
int err = get_version(osdmap.epoch+1, inc_bl);
assert(err == 0);
assert(inc_bl.length());
-
+
dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl;
OSDMap::Incremental inc(inc_bl);
err = osdmap.apply_incremental(inc);
assert(err == 0);
+ if (t == NULL)
+ t = new MonitorDBStore::Transaction;
+
// write out the full map for all past epochs
bufferlist full_bl;
osdmap.encode(full_bl);
- put_version_full(&t, osdmap.epoch, full_bl);
+ tx_size += full_bl.length();
+
+ put_version_full(t, osdmap.epoch, full_bl);
+ put_version_latest_full(t, osdmap.epoch);
// share
dout(1) << osdmap << dendl;
if (osdmap.epoch == 1) {
- t.erase("mkfs", "osdmap");
+ t->erase("mkfs", "osdmap");
+ }
+
+ if (tx_size > g_conf->mon_sync_max_payload_size*2) {
+ mon->store->apply_transaction(*t);
+ delete t;
+ t = NULL;
+ tx_size = 0;
}
}
- if (!t.empty())
- mon->store->apply_transaction(t);
+
+ if (t != NULL) {
+ mon->store->apply_transaction(*t);
+ delete t;
+ }
for (int o = 0; o < osdmap.get_max_osd(); o++) {
if (osdmap.is_down(o)) {
@@ -620,7 +642,6 @@ void OSDMonitor::encode_trim_extra(MonitorDBStore::Transaction *tx, version_t fi
bufferlist bl;
get_version_full(first, bl);
put_version_full(tx, first, bl);
- put_version_latest_full(tx, first);
}
// -------------
@@ -3001,7 +3022,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
cmd_getval(g_ceph_context, cmdmap, "weight", w);
err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
- if (err == 0) {
+ if (err >= 0) {
pending_inc.crush.clear();
newcrush.encode(pending_inc.crush);
ss << "reweighted item id " << id << " name '" << name << "' to " << w
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index e9a35c6b8ab..ea70bbd61c3 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
::encode(version, bl);
::encode(pg_stat_updates, bl);
::encode(osd_stat_updates, bl);
@@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
::encode(nearfull_ratio, bl);
::encode(pg_remove, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl)
}
if (struct_v >= 6)
::decode(stamp, bl);
+ if (struct_v >= 7) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin();
+ i != osd_stat_updates.end();
+ ++i) {
+ // This isn't accurate, but will cause trimming to behave as it
+ // did previously.
+ osd_epochs.insert(make_pair(i->first, osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
}
@@ -140,6 +152,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->version = 2;
o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
o.back()->osd_stat_updates[5] = osd_stat_t();
+ o.back()->osd_epochs[5] = 12;
o.push_back(new Incremental);
o.back()->version = 3;
o.back()->osdmap_epoch = 1;
@@ -148,6 +161,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->nearfull_ratio = .3;
o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
o.back()->osd_stat_updates[6] = osd_stat_t();
+ o.back()->osd_epochs[6] = 12;
o.back()->pg_remove.insert(pg_t(1,2,3));
o.back()->osd_stat_rm.insert(5);
}
@@ -195,8 +209,10 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
stat_pg_add(update_pg, update_stat);
}
- for (map<int32_t,osd_stat_t>::const_iterator p = inc.osd_stat_updates.begin();
- p != inc.osd_stat_updates.end();
+ assert(osd_stat.size() == osd_epochs.size());
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ inc.get_osd_stat_updates().begin();
+ p != inc.get_osd_stat_updates().end();
++p) {
int osd = p->first;
const osd_stat_t &new_stats(p->second);
@@ -209,6 +225,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
stat_osd_sub(t->second);
t->second = new_stats;
}
+ assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end());
+ osd_epochs.insert(*(inc.get_osd_epochs().find(osd)));
stat_osd_add(new_stats);
@@ -226,8 +244,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
}
- for (set<int>::iterator p = inc.osd_stat_rm.begin();
- p != inc.osd_stat_rm.end();
+ for (set<int>::iterator p = inc.get_osd_stat_rm().begin();
+ p != inc.get_osd_stat_rm().end();
++p) {
hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p);
if (t != osd_stat.end()) {
@@ -416,6 +434,14 @@ epoch_t PGMap::calc_min_last_epoch_clean() const
if (lec < min)
min = lec;
}
+ // also scan osd epochs
+ // don't trim past the oldest reported osd epoch
+ for (hash_map<int32_t, epoch_t>::const_iterator i = osd_epochs.begin();
+ i != osd_epochs.end();
+ ++i) {
+ if (i->second < min)
+ min = i->second;
+ }
return min;
}
@@ -434,7 +460,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(5, 4, bl);
+ ENCODE_START(6, 4, bl);
::encode(version, bl);
::encode(pg_stat, bl);
::encode(osd_stat, bl);
@@ -443,6 +469,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
::encode(full_ratio, bl);
::encode(nearfull_ratio, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -472,6 +499,17 @@ void PGMap::decode(bufferlist::iterator &bl)
}
if (struct_v >= 5)
::decode(stamp, bl);
+ if (struct_v >= 6) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (hash_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin();
+ i != osd_stat.end();
+ ++i) {
+ // This isn't accurate, but will cause trimming to behave as it
+ // did previously.
+ osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
calc_stats();
@@ -488,7 +526,10 @@ void PGMap::dirty_all(Incremental& inc)
inc.pg_stat_updates[p->first] = p->second;
}
for (hash_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- inc.osd_stat_updates[p->first] = p->second;
+ assert(inc.get_osd_epochs().count(p->first));
+ inc.update_stat(p->first,
+ inc.get_osd_epochs().find(p->first)->second,
+ p->second);
}
}
@@ -701,7 +742,8 @@ void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) c
{
hash_map<pg_t, pg_stat_t> stuck_pg_stats;
get_stuck_stats(type, cutoff, stuck_pg_stats);
- dump_pg_stats_plain(ss, stuck_pg_stats);
+ if (!stuck_pg_stats.empty())
+ dump_pg_stats_plain(ss, stuck_pg_stats);
}
void PGMap::dump_osd_perf_stats(Formatter *f) const
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 84d89f87517..7a202fc0006 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -43,12 +43,13 @@ public:
float full_ratio;
float nearfull_ratio;
+ // mapping of osd to most recently reported osdmap epoch
+ hash_map<int32_t,epoch_t> osd_epochs;
+
class Incremental {
public:
version_t version;
map<pg_t,pg_stat_t> pg_stat_updates;
- map<int32_t,osd_stat_t> osd_stat_updates;
- set<int32_t> osd_stat_rm;
epoch_t osdmap_epoch;
epoch_t pg_scan; // osdmap epoch
set<pg_t> pg_remove;
@@ -56,6 +57,38 @@ public:
float nearfull_ratio;
utime_t stamp;
+ private:
+ map<int32_t,osd_stat_t> osd_stat_updates;
+ set<int32_t> osd_stat_rm;
+
+ // mapping of osd to most recently reported osdmap epoch
+ map<int32_t,epoch_t> osd_epochs;
+ public:
+
+ const map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
+ return osd_stat_updates;
+ }
+ const set<int32_t> &get_osd_stat_rm() const {
+ return osd_stat_rm;
+ }
+ const map<int32_t, epoch_t> &get_osd_epochs() const {
+ return osd_epochs;
+ }
+
+ void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) {
+ osd_stat_updates[osd] = stat;
+ osd_epochs[osd] = epoch;
+ assert(osd_epochs.size() == osd_stat_updates.size());
+ }
+ void stat_osd_out(int32_t osd) {
+ // 0 the stats for the osd
+ osd_stat_updates[osd] = osd_stat_t();
+ }
+ void rm_stat(int32_t osd) {
+ osd_stat_rm.insert(osd);
+ osd_epochs.erase(osd);
+ osd_stat_updates.erase(osd);
+ }
void encode(bufferlist &bl, uint64_t features=-1) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 2a677be61d9..0644922ddb4 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -494,15 +494,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t)
{
bufferlist dirty;
string prefix = pgmap_osd_prefix;
- for (map<int32_t,osd_stat_t>::const_iterator p = pending_inc.osd_stat_updates.begin();
- p != pending_inc.osd_stat_updates.end();
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ pending_inc.get_osd_stat_updates().begin();
+ p != pending_inc.get_osd_stat_updates().end();
++p) {
::encode(p->first, dirty);
bufferlist bl;
::encode(p->second, bl, features);
t->put(prefix, stringify(p->first), bl);
}
- for (set<int32_t>::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) {
+ for (set<int32_t>::const_iterator p =
+ pending_inc.get_osd_stat_rm().begin();
+ p != pending_inc.get_osd_stat_rm().end();
+ ++p) {
::encode(*p, dirty);
t->erase(prefix, stringify(*p));
}
@@ -725,7 +729,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
}
// osd stat
- pending_inc.osd_stat_updates[from] = stats->osd_stat;
+ if (mon->osdmon()->osdmap.is_in(from)) {
+ pending_inc.update_stat(from, stats->epoch, stats->osd_stat);
+ } else {
+ pending_inc.update_stat(from, stats->epoch, osd_stat_t());
+ }
if (pg_map.osd_stat.count(from))
dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl;
@@ -842,11 +850,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
++p)
if (p->second == CEPH_OSD_OUT) {
dout(10) << "check_osd_map osd." << p->first << " went OUT" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
- } else {
- dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl;
- pending_inc.osd_stat_rm.erase(p->first);
- pending_inc.osd_stat_updates[p->first];
+ pending_inc.stat_osd_out(p->first);
}
// this is conservative: we want to know if any osds (maybe) got marked down.
@@ -867,7 +871,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
// whether it was created *or* destroyed, we can safely drop
// it's osd_stat_t record.
dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
+ pending_inc.rm_stat(p->first);
// and adjust full, nearfull set
pg_map.nearfull_osds.erase(p->first);
@@ -1847,6 +1851,54 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
detail->push_back(make_pair(HEALTH_ERR, ss.str()));
}
}
+
+ // pg skew
+ int num_in = mon->osdmon()->osdmap.get_num_in_osds();
+ if (num_in && g_conf->mon_pg_warn_min_per_osd > 0) {
+ int per = pg_map.pg_stat.size() / num_in;
+ if (per < g_conf->mon_pg_warn_min_per_osd) {
+ ostringstream ss;
+ ss << "too few pgs per osd (" << per << " < min " << g_conf->mon_pg_warn_min_per_osd << ")";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail)
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ if (!pg_map.pg_stat.empty()) {
+ for (hash_map<int,pool_stat_t>::const_iterator p = pg_map.pg_pool_sum.begin();
+ p != pg_map.pg_pool_sum.end();
+ ++p) {
+ const pg_pool_t *pi = mon->osdmon()->osdmap.get_pg_pool(p->first);
+ if (!pi)
+ continue; // in case osdmap changes haven't propagated to PGMap yet
+ if (pi->get_pg_num() > pi->get_pgp_num()) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " pg_num "
+ << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail)
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
+ if (average_objects_per_pg > 0) {
+ int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
+ float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
+ if (g_conf->mon_pg_warn_max_object_skew > 0 &&
+ ratio > g_conf->mon_pg_warn_max_object_skew) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " has too few pgs";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ if (detail) {
+ ostringstream ss;
+ ss << "pool " << mon->osdmon()->osdmap.get_pool_name(p->first) << " objects per pg ("
+ << objects_per_pg << ") is more than " << ratio << " times cluster average ("
+ << average_objects_per_pg << ")";
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ }
+ }
+ }
}
void PGMonitor::check_full_osd_health(list<pair<health_status_t,string> >& summary,
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 44015395e94..d29f47c1c43 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -28,6 +28,7 @@ using namespace std;
#include "PaxosService.h"
#include "include/types.h"
#include "include/utime.h"
+#include "include/histogram.h"
#include "msg/Messenger.h"
#include "common/config.h"
#include "mon/MonitorDBStore.h"
diff --git a/src/msg/Pipe.cc b/src/msg/Pipe.cc
index 50656fee53b..66b64d0097a 100644
--- a/src/msg/Pipe.cc
+++ b/src/msg/Pipe.cc
@@ -1136,6 +1136,19 @@ void Pipe::unregister_pipe()
}
}
+void Pipe::join()
+{
+ ldout(msgr->cct, 20) << "join" << dendl;
+ if (writer_thread.is_started())
+ writer_thread.join();
+ if (reader_thread.is_started())
+ reader_thread.join();
+ if (delay_thread) {
+ ldout(msgr->cct, 20) << "joining delay_thread" << dendl;
+ delay_thread->stop();
+ delay_thread->join();
+ }
+}
void Pipe::requeue_sent()
{
diff --git a/src/msg/Pipe.h b/src/msg/Pipe.h
index 5f94305350c..6c91395a352 100644
--- a/src/msg/Pipe.h
+++ b/src/msg/Pipe.h
@@ -234,16 +234,7 @@ class DispatchQueue;
void register_pipe();
void unregister_pipe();
- void join() {
- if (writer_thread.is_started())
- writer_thread.join();
- if (reader_thread.is_started())
- reader_thread.join();
- if (delay_thread) {
- delay_thread->stop();
- delay_thread->join();
- }
- }
+ void join();
void stop();
void _send(Message *m) {
diff --git a/src/msg/msg_types.cc b/src/msg/msg_types.cc
index 38416abd4f2..b02db768bfb 100644
--- a/src/msg/msg_types.cc
+++ b/src/msg/msg_types.cc
@@ -135,7 +135,7 @@ bool entity_addr_t::parse(const char *s, const char **end)
ostream& operator<<(ostream& out, const sockaddr_storage &ss)
{
char buf[NI_MAXHOST] = { 0 };
- char serv[20] = { 0 };
+ char serv[NI_MAXSERV] = { 0 };
size_t hostlen;
if (ss.ss_family == AF_INET)
diff --git a/src/objclass/class_api.cc b/src/objclass/class_api.cc
index 1ac224cdfe7..bb26c752f9b 100644
--- a/src/objclass/class_api.cc
+++ b/src/objclass/class_api.cc
@@ -177,7 +177,7 @@ int cls_read(cls_method_context_t hctx, int ofs, int len,
int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
{
ReplicatedPG::OpContext **pctx = static_cast<ReplicatedPG::OpContext **>(hctx);
- *origin = (*pctx)->op->request->get_orig_source_inst();
+ *origin = (*pctx)->op->get_req()->get_orig_source_inst();
return 0;
}
diff --git a/src/objsync/boto_del.py b/src/objsync/boto_del.py
index 14e790544ec..ba512e1ca33 100755
--- a/src/objsync/boto_del.py
+++ b/src/objsync/boto_del.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
#
# Ceph - scalable distributed file system
diff --git a/src/os/CollectionIndex.h b/src/os/CollectionIndex.h
index 9b1ceae8c46..89b7b862632 100644
--- a/src/os/CollectionIndex.h
+++ b/src/os/CollectionIndex.h
@@ -23,7 +23,7 @@
#include "include/object.h"
/**
- * CollectionIndex provides an interface for manipulating indexed colelctions
+ * CollectionIndex provides an interface for manipulating indexed collections
*/
class CollectionIndex {
protected:
@@ -127,26 +127,26 @@ protected:
* @return Error Code, 0 for success
*/
virtual int created(
- const hobject_t &hoid, ///< [in] Created object.
+ const ghobject_t &oid, ///< [in] Created object.
const char *path ///< [in] Path to created object.
) = 0;
/**
- * Removes hoid from the collection
+ * Removes oid from the collection
*
* @return Error Code, 0 for success
*/
virtual int unlink(
- const hobject_t &hoid ///< [in] Object to remove
+ const ghobject_t &oid ///< [in] Object to remove
) = 0;
/**
- * Gets the IndexedPath for hoid.
+ * Gets the IndexedPath for oid.
*
* @return Error Code, 0 for success
*/
virtual int lookup(
- const hobject_t &hoid, ///< [in] Object to lookup
+ const ghobject_t &oid, ///< [in] Object to lookup
IndexedPath *path, ///< [out] Path to object
int *exist ///< [out] True if the object exists, else false
) = 0;
@@ -167,17 +167,17 @@ protected:
/// List contents of collection by hash
virtual int collection_list_partial(
- const hobject_t &start, ///< [in] object at which to start
+ const ghobject_t &start, ///< [in] object at which to start
int min_count, ///< [in] get at least min_count objects
int max_count, ///< [in] return at most max_count objects
snapid_t seq, ///< [in] list only objects with snap >= seq
- vector<hobject_t> *ls, ///< [out] Listed objects
- hobject_t *next ///< [out] Next object to list
+ vector<ghobject_t> *ls, ///< [out] Listed objects
+ ghobject_t *next ///< [out] Next object to list
) = 0;
/// List contents of collection.
virtual int collection_list(
- vector<hobject_t> *ls ///< [out] Listed Objects
+ vector<ghobject_t> *ls ///< [out] Listed Objects
) = 0;
/// Call prior to removing directory
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index 90c840bbe9c..635870b0db5 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -130,61 +130,68 @@ bool DBObjectMap::check(std::ostream &out)
return retval;
}
-string DBObjectMap::hobject_key(const hobject_t &hoid)
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
{
string out;
- append_escaped(hoid.oid.name, &out);
+ append_escaped(oid.hobj.oid.name, &out);
out.push_back('.');
- append_escaped(hoid.get_key(), &out);
+ append_escaped(oid.hobj.get_key(), &out);
out.push_back('.');
- append_escaped(hoid.nspace, &out);
+ append_escaped(oid.hobj.nspace, &out);
out.push_back('.');
char snap_with_hash[1000];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
- if (hoid.pool == -1)
+ if (oid.hobj.pool == -1)
t += snprintf(t, end - t, ".none");
else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)hoid.pool);
- snprintf(t, end - t, ".%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
+ snprintf(t, end - t, ".%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
+
+ if (oid.generation != ghobject_t::NO_GEN) {
+ assert(oid.shard_id != ghobject_t::NO_SHARD);
+
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
+ t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
+ }
out += string(snap_with_hash);
return out;
}
-string DBObjectMap::hobject_key_v0(coll_t c, const hobject_t &hoid)
+string DBObjectMap::ghobject_key_v0(coll_t c, const ghobject_t &oid)
{
string out;
append_escaped(c.to_str(), &out);
out.push_back('.');
- append_escaped(hoid.oid.name, &out);
+ append_escaped(oid.hobj.oid.name, &out);
out.push_back('.');
- append_escaped(hoid.get_key(), &out);
+ append_escaped(oid.hobj.get_key(), &out);
out.push_back('.');
char snap_with_hash[1000];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, ".head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, ".snapdir");
else
- t += snprintf(t, end - t, ".%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, ".%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, ".%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
out += string(snap_with_hash);
return out;
}
-bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c,
- hobject_t *hoid)
+bool DBObjectMap::parse_ghobject_key_v0(const string &in, coll_t *c,
+ ghobject_t *oid)
{
string coll;
string name;
@@ -244,13 +251,13 @@ bool DBObjectMap::parse_hobject_key_v0(const string &in, coll_t *c,
pg_t pg;
if (c->is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- (*hoid) = hobject_t(name, key, snap, hash, pool, "");
+ (*oid) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
-string DBObjectMap::map_header_key(const hobject_t &hoid)
+string DBObjectMap::map_header_key(const ghobject_t &oid)
{
- return hobject_key(hoid);
+ return ghobject_key(oid);
}
string DBObjectMap::header_key(uint64_t seq)
@@ -311,9 +318,9 @@ int DBObjectMap::DBObjectMapIteratorImpl::init()
}
ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
- const hobject_t &hoid)
+ const ghobject_t &oid)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return ObjectMapIterator(new EmptyIteratorImpl());
return _get_iterator(header);
@@ -496,15 +503,15 @@ int DBObjectMap::DBObjectMapIteratorImpl::status()
return r;
}
-int DBObjectMap::set_keys(const hobject_t &hoid,
+int DBObjectMap::set_keys(const ghobject_t &oid,
const map<string, bufferlist> &set,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->set(user_prefix(header), set);
@@ -512,15 +519,15 @@ int DBObjectMap::set_keys(const hobject_t &hoid,
return db->submit_transaction(t);
}
-int DBObjectMap::set_header(const hobject_t &hoid,
+int DBObjectMap::set_header(const ghobject_t &oid,
const bufferlist &bl,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
_set_header(header, bl, t);
return db->submit_transaction(t);
@@ -534,10 +541,10 @@ void DBObjectMap::_set_header(Header header, const bufferlist &bl,
t->set(sys_prefix(header), to_set);
}
-int DBObjectMap::get_header(const hobject_t &hoid,
+int DBObjectMap::get_header(const ghobject_t &oid,
bufferlist *bl)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header) {
return 0;
}
@@ -568,16 +575,16 @@ int DBObjectMap::_get_header(Header header,
return 0;
}
-int DBObjectMap::clear(const hobject_t &hoid,
+int DBObjectMap::clear(const ghobject_t &oid,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
- remove_map_header(hoid, header, t);
+ remove_map_header(oid, header, t);
assert(header->num_children > 0);
header->num_children--;
int r = _clear(header, t);
@@ -688,15 +695,15 @@ int DBObjectMap::need_parent(DBObjectMapIterator iter)
return 1;
}
-int DBObjectMap::rm_keys(const hobject_t &hoid,
+int DBObjectMap::rm_keys(const ghobject_t &oid,
const set<string> &to_clear,
const SequencerPosition *spos)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
KeyValueDB::Transaction t = db->get_transaction();
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->rmkeys(user_prefix(header), to_clear);
if (!header->parent) {
@@ -756,17 +763,17 @@ int DBObjectMap::rm_keys(const hobject_t &hoid,
parent->num_children--;
_clear(parent, t);
header->parent = 0;
- set_map_header(hoid, *header, t);
+ set_map_header(oid, *header, t);
t->rmkeys_by_prefix(complete_prefix(header));
}
return db->submit_transaction(t);
}
-int DBObjectMap::get(const hobject_t &hoid,
+int DBObjectMap::get(const ghobject_t &oid,
bufferlist *_header,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
_get_header(header, _header);
@@ -779,13 +786,13 @@ int DBObjectMap::get(const hobject_t &hoid,
return 0;
}
-int DBObjectMap::get_keys(const hobject_t &hoid,
+int DBObjectMap::get_keys(const ghobject_t &oid,
set<string> *keys)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- ObjectMapIterator iter = get_iterator(hoid);
+ ObjectMapIterator iter = get_iterator(oid);
for (; iter->valid(); iter->next()) {
if (iter->status())
return iter->status();
@@ -816,40 +823,40 @@ int DBObjectMap::scan(Header header,
return 0;
}
-int DBObjectMap::get_values(const hobject_t &hoid,
+int DBObjectMap::get_values(const ghobject_t &oid,
const set<string> &keys,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return scan(header, keys, 0, out);
}
-int DBObjectMap::check_keys(const hobject_t &hoid,
+int DBObjectMap::check_keys(const ghobject_t &oid,
const set<string> &keys,
set<string> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return scan(header, keys, out, 0);
}
-int DBObjectMap::get_xattrs(const hobject_t &hoid,
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
const set<string> &to_get,
map<string, bufferlist> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
return db->get(xattr_prefix(header), to_get, out);
}
-int DBObjectMap::get_all_xattrs(const hobject_t &hoid,
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
set<string> *out)
{
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
@@ -860,39 +867,39 @@ int DBObjectMap::get_all_xattrs(const hobject_t &hoid,
return iter->status();
}
-int DBObjectMap::set_xattrs(const hobject_t &hoid,
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
const map<string, bufferlist> &to_set,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_create_map_header(hoid, t);
+ Header header = lookup_create_map_header(oid, t);
if (!header)
return -EINVAL;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->set(xattr_prefix(header), to_set);
return db->submit_transaction(t);
}
-int DBObjectMap::remove_xattrs(const hobject_t &hoid,
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
const set<string> &to_remove,
const SequencerPosition *spos)
{
KeyValueDB::Transaction t = db->get_transaction();
- Header header = lookup_map_header(hoid);
+ Header header = lookup_map_header(oid);
if (!header)
return -ENOENT;
- if (check_spos(hoid, header, spos))
+ if (check_spos(oid, header, spos))
return 0;
t->rmkeys(xattr_prefix(header), to_remove);
return db->submit_transaction(t);
}
-int DBObjectMap::clone(const hobject_t &hoid,
- const hobject_t &target,
+int DBObjectMap::clone(const ghobject_t &oid,
+ const ghobject_t &target,
const SequencerPosition *spos)
{
- if (hoid == target)
+ if (oid == target)
return 0;
KeyValueDB::Transaction t = db->get_transaction();
@@ -907,18 +914,18 @@ int DBObjectMap::clone(const hobject_t &hoid,
}
}
- Header parent = lookup_map_header(hoid);
+ Header parent = lookup_map_header(oid);
if (!parent)
return db->submit_transaction(t);
- Header source = generate_new_header(hoid, parent);
+ Header source = generate_new_header(oid, parent);
Header destination = generate_new_header(target, parent);
if (spos)
destination->spos = *spos;
parent->num_children = 2;
set_header(parent, t);
- set_map_header(hoid, *source, t);
+ set_map_header(oid, *source, t);
set_map_header(target, *destination, t);
map<string, bufferlist> to_set;
@@ -973,9 +980,9 @@ int DBObjectMap::upgrade()
to_get);
coll_t coll;
- hobject_t hoid;
- assert(parse_hobject_key_v0(iter->key(), &coll, &hoid));
- new_map_headers[hobject_key(hoid)] = got.begin()->second;
+ ghobject_t oid;
+ assert(parse_ghobject_key_v0(iter->key(), &coll, &oid));
+ new_map_headers[ghobject_key(oid)] = got.begin()->second;
}
t->rmkeys(LEAF_PREFIX, legacy_to_remove);
@@ -1038,18 +1045,18 @@ int DBObjectMap::init(bool do_upgrade)
return 0;
}
-int DBObjectMap::sync(const hobject_t *hoid,
+int DBObjectMap::sync(const ghobject_t *oid,
const SequencerPosition *spos) {
KeyValueDB::Transaction t = db->get_transaction();
write_state(t);
- if (hoid) {
+ if (oid) {
assert(spos);
- Header header = lookup_map_header(*hoid);
+ Header header = lookup_map_header(*oid);
if (header) {
- dout(10) << "hoid: " << *hoid << " setting spos to "
+ dout(10) << "oid: " << *oid << " setting spos to "
<< *spos << dendl;
header->spos = *spos;
- set_map_header(*hoid, *header, t);
+ set_map_header(*oid, *header, t);
}
}
return db->submit_transaction_sync(t);
@@ -1067,27 +1074,27 @@ int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
}
-DBObjectMap::Header DBObjectMap::_lookup_map_header(const hobject_t &hoid)
+DBObjectMap::Header DBObjectMap::_lookup_map_header(const ghobject_t &oid)
{
- while (map_header_in_use.count(hoid))
+ while (map_header_in_use.count(oid))
header_cond.Wait(header_lock);
map<string, bufferlist> out;
set<string> to_get;
- to_get.insert(map_header_key(hoid));
+ to_get.insert(map_header_key(oid));
int r = db->get(HOBJECT_TO_SEQ, to_get, &out);
if (r < 0)
return Header();
if (out.empty())
return Header();
- Header ret(new _Header(), RemoveMapHeaderOnDelete(this, hoid));
+ Header ret(new _Header(), RemoveMapHeaderOnDelete(this, oid));
bufferlist::iterator iter = out.begin()->second.begin();
ret->decode(iter);
return ret;
}
-DBObjectMap::Header DBObjectMap::_generate_new_header(const hobject_t &hoid,
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
Header parent)
{
Header header = Header(new _Header(), RemoveOnDelete(this));
@@ -1097,7 +1104,7 @@ DBObjectMap::Header DBObjectMap::_generate_new_header(const hobject_t &hoid,
header->spos = parent->spos;
}
header->num_children = 1;
- header->hoid = hoid;
+ header->oid = oid;
assert(!in_use.count(header->seq));
in_use.insert(header->seq);
@@ -1137,14 +1144,14 @@ DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
}
DBObjectMap::Header DBObjectMap::lookup_create_map_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
KeyValueDB::Transaction t)
{
Mutex::Locker l(header_lock);
- Header header = _lookup_map_header(hoid);
+ Header header = _lookup_map_header(oid);
if (!header) {
- header = _generate_new_header(hoid, Header());
- set_map_header(hoid, *header, t);
+ header = _generate_new_header(oid, Header());
+ set_map_header(oid, *header, t);
}
return header;
}
@@ -1169,50 +1176,50 @@ void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
t->set(sys_prefix(header), to_write);
}
-void DBObjectMap::remove_map_header(const hobject_t &hoid,
+void DBObjectMap::remove_map_header(const ghobject_t &oid,
Header header,
KeyValueDB::Transaction t)
{
dout(20) << "remove_map_header: removing " << header->seq
- << " hoid " << hoid << dendl;
+ << " oid " << oid << dendl;
set<string> to_remove;
- to_remove.insert(map_header_key(hoid));
+ to_remove.insert(map_header_key(oid));
t->rmkeys(HOBJECT_TO_SEQ, to_remove);
}
-void DBObjectMap::set_map_header(const hobject_t &hoid, _Header header,
+void DBObjectMap::set_map_header(const ghobject_t &oid, _Header header,
KeyValueDB::Transaction t)
{
dout(20) << "set_map_header: setting " << header.seq
- << " hoid " << hoid << " parent seq "
+ << " oid " << oid << " parent seq "
<< header.parent << dendl;
map<string, bufferlist> to_set;
- header.encode(to_set[map_header_key(hoid)]);
+ header.encode(to_set[map_header_key(oid)]);
t->set(HOBJECT_TO_SEQ, to_set);
}
-bool DBObjectMap::check_spos(const hobject_t &hoid,
+bool DBObjectMap::check_spos(const ghobject_t &oid,
Header header,
const SequencerPosition *spos)
{
if (!spos || *spos > header->spos) {
stringstream out;
if (spos)
- dout(10) << "hoid: " << hoid << " not skipping op, *spos "
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
<< *spos << dendl;
else
- dout(10) << "hoid: " << hoid << " not skipping op, *spos "
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
<< "empty" << dendl;
dout(10) << " > header.spos " << header->spos << dendl;
return false;
} else {
- dout(10) << "hoid: " << hoid << " skipping op, *spos " << *spos
+ dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
<< " <= header.spos " << header->spos << dendl;
return true;
}
}
-int DBObjectMap::list_objects(vector<hobject_t> *out)
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
{
KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
for (iter->seek_to_first(); iter->valid(); iter->next()) {
@@ -1220,7 +1227,7 @@ int DBObjectMap::list_objects(vector<hobject_t> *out)
bufferlist::iterator bliter = bl.begin();
_Header header;
header.decode(bliter);
- out->push_back(header.hoid);
+ out->push_back(header.oid);
}
return 0;
}
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index ba05dff6c6f..459447f9c97 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -26,7 +26,7 @@
* @see user_prefix
* @see sys_prefix
*
- * - HOBJECT_TO_SEQ: Contains leaf mapping from hobject_t->seq and
+ * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->hobj.seq and
* corresponding omap header
* - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
* @see State
@@ -66,89 +66,89 @@ public:
* Set of headers currently in use
*/
set<uint64_t> in_use;
- set<hobject_t> map_header_in_use;
+ set<ghobject_t> map_header_in_use;
DBObjectMap(KeyValueDB *db) : db(db),
header_lock("DBOBjectMap")
{}
int set_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const map<string, bufferlist> &set,
const SequencerPosition *spos=0
);
int set_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const bufferlist &bl,
const SequencerPosition *spos=0
);
int get_header(
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *bl
);
int clear(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const SequencerPosition *spos=0
);
int rm_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_clear,
const SequencerPosition *spos=0
);
int get(
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *header,
map<string, bufferlist> *out
);
int get_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
set<string> *keys
);
int get_values(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &keys,
map<string, bufferlist> *out
);
int check_keys(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &keys,
set<string> *out
);
int get_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_get,
map<string, bufferlist> *out
);
int get_all_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
set<string> *out
);
int set_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const map<string, bufferlist> &to_set,
const SequencerPosition *spos=0
);
int remove_xattrs(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const set<string> &to_remove,
const SequencerPosition *spos=0
);
int clone(
- const hobject_t &hoid,
- const hobject_t &target,
+ const ghobject_t &oid,
+ const ghobject_t &target,
const SequencerPosition *spos=0
);
@@ -162,13 +162,13 @@ public:
bool check(std::ostream &out);
/// Ensure that all previous operations are durable
- int sync(const hobject_t *hoid=0, const SequencerPosition *spos=0);
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0);
/// Util, list all objects, there must be no other concurrent access
- int list_objects(vector<hobject_t> *objs ///< [out] objects
+ int list_objects(vector<ghobject_t> *objs ///< [out] objects
);
- ObjectMapIterator get_iterator(const hobject_t &hoid);
+ ObjectMapIterator get_iterator(const ghobject_t &oid);
static const string USER_PREFIX;
static const string XATTR_PREFIX;
@@ -223,7 +223,7 @@ public:
uint64_t num_children;
coll_t c;
- hobject_t hoid;
+ ghobject_t oid;
SequencerPosition spos;
@@ -233,7 +233,7 @@ public:
::encode(parent, bl);
::encode(num_children, bl);
::encode(c, bl);
- ::encode(hoid, bl);
+ ::encode(oid, bl);
::encode(spos, bl);
ENCODE_FINISH(bl);
}
@@ -244,7 +244,7 @@ public:
::decode(parent, bl);
::decode(num_children, bl);
::decode(c, bl);
- ::decode(hoid, bl);
+ ::decode(oid, bl);
if (struct_v >= 2)
::decode(spos, bl);
DECODE_FINISH(bl);
@@ -255,7 +255,7 @@ public:
f->dump_unsigned("parent", parent);
f->dump_unsigned("num_children", num_children);
f->dump_stream("coll") << c;
- f->dump_stream("oid") << hoid;
+ f->dump_stream("oid") << oid;
}
static void generate_test_instances(list<_Header*> &o) {
@@ -269,15 +269,15 @@ public:
};
/// String munging (public for testing)
- static string hobject_key(const hobject_t &hoid);
- static string hobject_key_v0(coll_t c, const hobject_t &hoid);
- static bool parse_hobject_key_v0(const string &in,
- coll_t *c, hobject_t *hoid);
+ static string ghobject_key(const ghobject_t &oid);
+ static string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static bool parse_ghobject_key_v0(const string &in,
+ coll_t *c, ghobject_t *oid);
private:
/// Implicit lock on Header->seq
typedef std::tr1::shared_ptr<_Header> Header;
- string map_header_key(const hobject_t &hoid);
+ string map_header_key(const ghobject_t &oid);
string header_key(uint64_t seq);
string complete_prefix(Header header);
string user_prefix(Header header);
@@ -368,40 +368,40 @@ private:
/// Set node containing input to new contents
void set_header(Header input, KeyValueDB::Transaction t);
- /// Remove leaf node corresponding to hoid in c
- void remove_map_header(const hobject_t &hoid,
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(const ghobject_t &oid,
Header header,
KeyValueDB::Transaction t);
- /// Set leaf node for c and hoid to the value of header
- void set_map_header(const hobject_t &hoid, _Header header,
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(const ghobject_t &oid, _Header header,
KeyValueDB::Transaction t);
- /// Set leaf node for c and hoid to the value of header
- bool check_spos(const hobject_t &hoid,
+ /// Set leaf node for c and oid to the value of header
+ bool check_spos(const ghobject_t &oid,
Header header,
const SequencerPosition *spos);
- /// Lookup or create header for c hoid
- Header lookup_create_map_header(const hobject_t &hoid,
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(const ghobject_t &oid,
KeyValueDB::Transaction t);
/**
- * Generate new header for c hoid with new seq number
+ * Generate new header for c oid with new seq number
*
* Has the side effect of syncronously saving the new DBObjectMap state
*/
- Header _generate_new_header(const hobject_t &hoid, Header parent);
- Header generate_new_header(const hobject_t &hoid, Header parent) {
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
Mutex::Locker l(header_lock);
- return _generate_new_header(hoid, parent);
+ return _generate_new_header(oid, parent);
}
- /// Lookup leaf header for c hoid
- Header _lookup_map_header(const hobject_t &hoid);
- Header lookup_map_header(const hobject_t &hoid) {
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(const ghobject_t &oid);
+ Header lookup_map_header(const ghobject_t &oid) {
Mutex::Locker l(header_lock);
- return _lookup_map_header(hoid);
+ return _lookup_map_header(oid);
}
/// Lookup header node for input
@@ -448,12 +448,12 @@ private:
class RemoveMapHeaderOnDelete {
public:
DBObjectMap *db;
- hobject_t obj;
- RemoveMapHeaderOnDelete(DBObjectMap *db, const hobject_t &obj) :
- db(db), obj(obj) {}
+ ghobject_t oid;
+ RemoveMapHeaderOnDelete(DBObjectMap *db, const ghobject_t &oid) :
+ db(db), oid(oid) {}
void operator() (_Header *header) {
Mutex::Locker l(db->header_lock);
- db->map_header_in_use.erase(obj);
+ db->map_header_in_use.erase(oid);
db->map_header_cond.Signal();
delete header;
}
diff --git a/src/os/FDCache.h b/src/os/FDCache.h
index 00e632f3e0f..93557d43c47 100644
--- a/src/os/FDCache.h
+++ b/src/os/FDCache.h
@@ -49,7 +49,7 @@ public:
};
private:
- SharedLRU<hobject_t, FD> registry;
+ SharedLRU<ghobject_t, FD> registry;
CephContext *cct;
public:
@@ -63,16 +63,16 @@ public:
}
typedef std::tr1::shared_ptr<FD> FDRef;
- FDRef lookup(const hobject_t &hoid) {
+ FDRef lookup(const ghobject_t &hoid) {
return registry.lookup(hoid);
}
- FDRef add(const hobject_t &hoid, int fd) {
+ FDRef add(const ghobject_t &hoid, int fd) {
return registry.add(hoid, new FD(fd));
}
/// clear cached fd for hoid, subsequent lookups will get an empty FD
- void clear(const hobject_t &hoid) {
+ void clear(const ghobject_t &hoid) {
registry.clear(hoid);
assert(!registry.lookup(hoid));
}
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index ac51f95006f..20afde9a0dc 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -86,6 +86,23 @@ using ceph::crypto::SHA1;
#define REPLAY_GUARD_XATTR "user.cephos.seq"
#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+//Features are added here that this FileStore supports.
+static CompatSet get_fs_supported_compat_set() {
+ CompatSet compat = get_fs_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
void FileStore::FSPerfTracker::update_from_perfcounters(
PerfCounters &logger)
@@ -124,12 +141,12 @@ int FileStore::init_index(coll_t cid)
{
char path[PATH_MAX];
get_cdir(cid, path, sizeof(path));
- int r = index_manager.init_index(cid, path, on_disk_version);
+ int r = index_manager.init_index(cid, path, target_version);
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
-int FileStore::lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path)
+int FileStore::lfn_find(coll_t cid, const ghobject_t& oid, IndexedPath *path)
{
Index index;
int r, exist;
@@ -147,20 +164,25 @@ int FileStore::lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path)
return 0;
}
-int FileStore::lfn_truncate(coll_t cid, const hobject_t& oid, off_t length)
+int FileStore::lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length)
{
IndexedPath path;
- int r = lfn_find(cid, oid, &path);
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd, &path);
if (r < 0)
return r;
- r = ::truncate(path->path(), length);
+ r = ::ftruncate(**fd, length);
if (r < 0)
r = -errno;
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_truncate(**fd, length);
+ assert(rc >= 0);
+ }
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
-int FileStore::lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf)
+int FileStore::lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf)
{
IndexedPath path;
int r = lfn_find(cid, oid, &path);
@@ -173,12 +195,15 @@ int FileStore::lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf)
}
int FileStore::lfn_open(coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
bool create,
FDRef *outfd,
IndexedPath *path,
Index *index)
{
+ assert(get_allow_sharded_objects() ||
+ ( oid.shard_id == ghobject_t::NO_SHARD &&
+ oid.generation == ghobject_t::NO_GEN ));
assert(outfd);
int flags = O_RDWR;
if (create)
@@ -246,7 +271,7 @@ void FileStore::lfn_close(FDRef fd)
{
}
-int FileStore::lfn_link(coll_t c, coll_t newcid, const hobject_t& o, const hobject_t& newoid)
+int FileStore::lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid)
{
Index index_new, index_old;
IndexedPath path_new, path_old;
@@ -298,7 +323,7 @@ int FileStore::lfn_link(coll_t c, coll_t newcid, const hobject_t& o, const hobje
return 0;
}
-int FileStore::lfn_unlink(coll_t cid, const hobject_t& o,
+int FileStore::lfn_unlink(coll_t cid, const ghobject_t& o,
const SequencerPosition &spos,
bool force_clear_omap)
{
@@ -397,7 +422,12 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
m_filestore_queue_committing_max_ops(g_conf->filestore_queue_committing_max_ops),
m_filestore_queue_committing_max_bytes(g_conf->filestore_queue_committing_max_bytes),
m_filestore_do_dump(false),
- m_filestore_dump_fmt(true)
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size),
+ m_fs_type(FS_TYPE_NONE),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0)
{
m_filestore_kill_at.set(g_conf->filestore_kill_at);
@@ -447,6 +477,8 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
generic_backend = new GenericFileStoreBackend(this);
backend = generic_backend;
+
+ superblock.compat_features = get_fs_initial_compat_set();
}
FileStore::~FileStore()
@@ -592,6 +624,13 @@ int FileStore::mkfs()
goto close_fsid_fd;
}
+ ret = write_superblock();
+ if (ret < 0) {
+ derr << "mkfs: write_superblock() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
struct statfs basefs;
ret = ::fstatfs(basedir_fd, &basefs);
if (ret < 0) {
@@ -791,12 +830,14 @@ int FileStore::_detect_fs()
blk_size = st.f_bsize;
+ m_fs_type = FS_TYPE_OTHER;
#if defined(__linux__)
if (st.f_type == BTRFS_SUPER_MAGIC) {
dout(0) << "mount detected btrfs" << dendl;
backend = new BtrfsFileStoreBackend(this);
wbthrottle.set_fs(WBThrottle::BTRFS);
+ m_fs_type = FS_TYPE_BTRFS;
} else if (st.f_type == XFS_SUPER_MAGIC) {
dout(1) << "mount detected xfs" << dendl;
if (m_filestore_replica_fadvise) {
@@ -804,15 +845,19 @@ int FileStore::_detect_fs()
g_conf->set_val("filestore_replica_fadvise", "false");
g_conf->apply_changes(NULL);
assert(m_filestore_replica_fadvise == false);
+ m_fs_type = FS_TYPE_XFS;
}
}
#endif
#ifdef HAVE_LIBZFS
if (st.f_type == ZFS_SUPER_MAGIC) {
backend = new ZFSFileStoreBackend(this);
+ m_fs_type = FS_TYPE_ZFS;
}
#endif
+ set_xattr_limits_via_conf();
+
r = backend->detect_features();
if (r < 0) {
derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
@@ -853,14 +898,7 @@ int FileStore::_detect_fs()
chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
if (ret == -ENOSPC) {
- if (!g_conf->filestore_xattr_use_omap) {
- dout(0) << "limited size xattrs -- automatically enabling filestore_xattr_use_omap" << dendl;
- g_conf->set_val("filestore_xattr_use_omap", "true");
- g_conf->apply_changes(NULL);
- assert(g_conf->filestore_xattr_use_omap == true);
- } else {
- dout(0) << "limited size xattrs -- filestore_xattr_use_omap already enabled" << dendl;
- }
+ dout(0) << "limited size xattrs" << dendl;
}
chain_fremovexattr(tmpfd, "user.test");
chain_fremovexattr(tmpfd, "user.test2");
@@ -917,6 +955,49 @@ int FileStore::_sanity_check_fs()
return 0;
}
+int FileStore::write_superblock()
+{
+ bufferlist bl;
+ ::encode(superblock, bl);
+ return safe_write_file(basedir.c_str(), "superblock",
+ bl.c_str(), bl.length());
+}
+
+int FileStore::read_superblock()
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "superblock",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ // If the file doesn't exist write initial CompatSet
+ return write_superblock();
+ }
+ return ret;
+ }
+
+ bufferlist bl;
+ bl.push_back(bp);
+ bufferlist::iterator i = bl.begin();
+ ::decode(superblock, i);
+ return 0;
+}
+
+void FileStore::set_allow_sharded_objects()
+{
+ if (!get_allow_sharded_objects()) {
+ superblock.compat_features.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ int ret = write_superblock();
+ assert(ret == 0); //Should we return error and make caller handle it?
+ }
+ return;
+}
+
+bool FileStore::get_allow_sharded_objects()
+{
+ return superblock.compat_features.incompat.contains(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+}
+
int FileStore::update_version_stamp()
{
return write_version_stamp();
@@ -924,25 +1005,19 @@ int FileStore::update_version_stamp()
int FileStore::version_stamp_is_valid(uint32_t *version)
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_RDONLY, 0644);
- if (fd < 0) {
- if (errno == ENOENT)
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT)
return 0;
- else
- return -errno;
+ return ret;
}
- bufferptr bp(PATH_MAX);
- int ret = safe_read(fd, bp.c_str(), bp.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
- return -errno;
bufferlist bl;
bl.push_back(bp);
bufferlist::iterator i = bl.begin();
::decode(*version, i);
- if (*version == on_disk_version)
+ if (*version == target_version)
return 1;
else
return 0;
@@ -950,19 +1025,11 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
int FileStore::write_version_stamp()
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0)
- return -errno;
bufferlist bl;
- ::encode(on_disk_version, bl);
-
- int ret = safe_write(fd, bl.c_str(), bl.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
- return -errno;
- return 0;
+ ::encode(target_version, bl);
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length());
}
int FileStore::read_op_seq(uint64_t *seq)
@@ -1004,6 +1071,7 @@ int FileStore::mount()
char buf[PATH_MAX];
uint64_t initial_op_seq;
set<string> cluster_snaps;
+ CompatSet supported_compat_set = get_fs_supported_compat_set();
dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
@@ -1058,12 +1126,26 @@ int FileStore::mount()
ret = -EINVAL;
derr << "FileStore::mount : stale version stamp " << version_stamp
<< ". Please run the FileStore update script before starting the "
- << "OSD, or set filestore_update_to to " << on_disk_version
+ << "OSD, or set filestore_update_to to " << target_version
<< dendl;
goto close_fsid_fd;
}
}
+ ret = read_superblock();
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // Check if this FileStore supports all the necessary features to mount
+ if (supported_compat_set.compare(superblock.compat_features) == -1) {
+ derr << "FileStore::mount : Incompatible features set "
+ << superblock.compat_features << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
// open some dir handles
basedir_fd = ::open(basedir.c_str(), O_RDONLY);
if (basedir_fd < 0) {
@@ -1711,7 +1793,7 @@ int FileStore::_do_transactions(
for (list<Transaction*>::iterator p = tls.begin();
p != tls.end();
++p, trans_num++) {
- r = _do_transaction(**p, op_seq, trans_num);
+ r = _do_transaction(**p, op_seq, trans_num, handle);
if (r < 0)
break;
if (handle)
@@ -1813,7 +1895,7 @@ void FileStore::_set_replay_guard(coll_t cid,
void FileStore::_set_replay_guard(int fd,
const SequencerPosition& spos,
- const hobject_t *hoid,
+ const ghobject_t *hoid,
bool in_progress)
{
if (backend->can_checkpoint())
@@ -1894,7 +1976,7 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
dout(10) << "_close_replay_guard " << spos << " done" << dendl;
}
-int FileStore::_check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& spos)
+int FileStore::_check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& spos)
{
if (!replaying || backend->can_checkpoint())
return 1;
@@ -1973,7 +2055,9 @@ int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
}
}
-unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_num)
+unsigned FileStore::_do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle)
{
dout(10) << "_do_transaction on " << &t << dendl;
@@ -1981,6 +2065,9 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
SequencerPosition spos(op_seq, trans_num, 0);
while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+
int op = i.get_op();
int r = 0;
@@ -1992,7 +2079,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_TOUCH:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _touch(cid, oid);
}
@@ -2001,7 +2088,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_WRITE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
bool replica = i.get_replica();
@@ -2015,7 +2102,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_ZERO:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2036,7 +2123,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_TRUNCATE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _truncate(cid, oid, off);
@@ -2046,7 +2133,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _remove(cid, oid, spos);
}
@@ -2055,7 +2142,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_SETATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
bufferlist bl;
i.get_bl(bl);
@@ -2073,7 +2160,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_SETATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferptr> aset;
i.get_attrset(aset);
if (_check_replay_guard(cid, oid, spos) > 0)
@@ -2086,7 +2173,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_RMATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _rmattr(cid, oid, name.c_str(), spos);
@@ -2096,7 +2183,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_RMATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _rmattrs(cid, oid, spos);
}
@@ -2105,8 +2192,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
r = _clone(cid, oid, noid, spos);
}
break;
@@ -2114,8 +2201,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONERANGE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
r = _clone_range(cid, oid, noid, off, len, off, spos);
@@ -2125,8 +2212,8 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_CLONERANGE2:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t srcoff = i.get_length();
uint64_t len = i.get_length();
uint64_t dstoff = i.get_length();
@@ -2154,7 +2241,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
{
coll_t ncid = i.get_cid();
coll_t ocid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _collection_add(ncid, ocid, oid, spos);
}
break;
@@ -2162,7 +2249,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_COLL_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
if (_check_replay_guard(cid, oid, spos) > 0)
r = _remove(cid, oid, spos);
}
@@ -2173,7 +2260,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
// WARNING: this is deprecated and buggy; only here to replay old journals.
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _collection_add(ocid, ncid, oid, spos);
if (r == 0 &&
(_check_replay_guard(ocid, oid, spos) > 0))
@@ -2184,9 +2271,9 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_COLL_MOVE_RENAME:
{
coll_t oldcid = i.get_cid();
- hobject_t oldoid = i.get_oid();
+ ghobject_t oldoid = i.get_oid();
coll_t newcid = i.get_cid();
- hobject_t newoid = i.get_oid();
+ ghobject_t newoid = i.get_oid();
r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
}
break;
@@ -2226,14 +2313,14 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_CLEAR:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
r = _omap_clear(cid, oid, spos);
}
break;
case Transaction::OP_OMAP_SETKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferlist> aset;
i.get_attrset(aset);
r = _omap_setkeys(cid, oid, aset, spos);
@@ -2242,7 +2329,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_RMKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
set<string> keys;
i.get_keyset(keys);
r = _omap_rmkeys(cid, oid, keys, spos);
@@ -2251,7 +2338,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_RMKEYRANGE:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string first, last;
first = i.get_key();
last = i.get_key();
@@ -2261,7 +2348,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
case Transaction::OP_OMAP_SETHEADER:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
bufferlist bl;
i.get_bl(bl);
r = _omap_setheader(cid, oid, bl, spos);
@@ -2381,7 +2468,7 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
// --------------------
// objects
-bool FileStore::exists(coll_t cid, const hobject_t& oid)
+bool FileStore::exists(coll_t cid, const ghobject_t& oid)
{
struct stat st;
if (stat(cid, oid, &st) == 0)
@@ -2391,7 +2478,7 @@ bool FileStore::exists(coll_t cid, const hobject_t& oid)
}
int FileStore::stat(
- coll_t cid, const hobject_t& oid, struct stat *st, bool allow_eio)
+ coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio)
{
int r = lfn_stat(cid, oid, st);
assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
@@ -2413,7 +2500,7 @@ int FileStore::stat(
int FileStore::read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
@@ -2449,6 +2536,17 @@ int FileStore::read(
}
bptr.set_length(got); // properly size the buffer
bl.push_back(bptr); // put it in the target bufferlist
+
+ if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+ ostringstream ss;
+ int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+ if (errors > 0) {
+ dout(0) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
+ << got << " ... BAD CRC:\n" << ss.str() << dendl;
+ assert(0 == "bad crc on read");
+ }
+ }
+
lfn_close(fd);
dout(10) << "FileStore::read " << cid << "/" << oid << " " << offset << "~"
@@ -2461,7 +2559,7 @@ int FileStore::read(
}
}
-int FileStore::fiemap(coll_t cid, const hobject_t& oid,
+int FileStore::fiemap(coll_t cid, const ghobject_t& oid,
uint64_t offset, size_t len,
bufferlist& bl)
{
@@ -2539,7 +2637,7 @@ done:
}
-int FileStore::_remove(coll_t cid, const hobject_t& oid,
+int FileStore::_remove(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos)
{
dout(15) << "remove " << cid << "/" << oid << dendl;
@@ -2548,7 +2646,7 @@ int FileStore::_remove(coll_t cid, const hobject_t& oid,
return r;
}
-int FileStore::_truncate(coll_t cid, const hobject_t& oid, uint64_t size)
+int FileStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size)
{
dout(15) << "truncate " << cid << "/" << oid << " size " << size << dendl;
int r = lfn_truncate(cid, oid, size);
@@ -2557,7 +2655,7 @@ int FileStore::_truncate(coll_t cid, const hobject_t& oid, uint64_t size)
}
-int FileStore::_touch(coll_t cid, const hobject_t& oid)
+int FileStore::_touch(coll_t cid, const ghobject_t& oid)
{
dout(15) << "touch " << cid << "/" << oid << dendl;
@@ -2572,7 +2670,7 @@ int FileStore::_touch(coll_t cid, const hobject_t& oid)
return r;
}
-int FileStore::_write(coll_t cid, const hobject_t& oid,
+int FileStore::_write(coll_t cid, const ghobject_t& oid,
uint64_t offset, size_t len,
const bufferlist& bl, bool replica)
{
@@ -2610,6 +2708,11 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
if (r == 0)
r = bl.length();
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ assert(rc >= 0);
+ }
+
// flush?
if (!replaying &&
g_conf->filestore_wbthrottle_enable)
@@ -2621,7 +2724,7 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
return r;
}
-int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len)
+int FileStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len)
{
dout(15) << "zero " << cid << "/" << oid << " " << offset << "~" << len << dendl;
int ret = 0;
@@ -2641,6 +2744,11 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
ret = -errno;
lfn_close(fd);
+ if (ret >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_zero(**fd, offset, len);
+ assert(rc >= 0);
+ }
+
if (ret == 0)
goto out; // yay!
if (ret != -EOPNOTSUPP)
@@ -2664,7 +2772,7 @@ int FileStore::_zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t l
return ret;
}
-int FileStore::_clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
const SequencerPosition& spos)
{
dout(15) << "clone " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
@@ -2794,11 +2902,15 @@ int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, u
break;
pos += r;
}
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ assert(rc >= 0);
+ }
dout(20) << "_do_copy_range " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
return r;
}
-int FileStore::_clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+int FileStore::_clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
uint64_t srcoff, uint64_t len, uint64_t dstoff,
const SequencerPosition& spos)
{
@@ -3239,23 +3351,23 @@ int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
}
// debug EIO injection
-void FileStore::inject_data_error(const hobject_t &oid) {
+void FileStore::inject_data_error(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": init error on " << oid << dendl;
data_error_set.insert(oid);
}
-void FileStore::inject_mdata_error(const hobject_t &oid) {
+void FileStore::inject_mdata_error(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": init error on " << oid << dendl;
mdata_error_set.insert(oid);
}
-void FileStore::debug_obj_on_delete(const hobject_t &oid) {
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
dout(10) << __func__ << ": clear error on " << oid << dendl;
data_error_set.erase(oid);
mdata_error_set.erase(oid);
}
-bool FileStore::debug_data_eio(const hobject_t &oid) {
+bool FileStore::debug_data_eio(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
if (data_error_set.count(oid)) {
dout(10) << __func__ << ": inject error on " << oid << dendl;
@@ -3264,7 +3376,7 @@ bool FileStore::debug_data_eio(const hobject_t &oid) {
return false;
}
}
-bool FileStore::debug_mdata_eio(const hobject_t &oid) {
+bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
Mutex::Locker l(read_error_lock);
if (mdata_error_set.count(oid)) {
dout(10) << __func__ << ": inject error on " << oid << dendl;
@@ -3277,7 +3389,7 @@ bool FileStore::debug_mdata_eio(const hobject_t &oid) {
// objects
-int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr &bp)
+int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp)
{
dout(15) << "getattr " << cid << "/" << oid << " '" << name << "'" << dendl;
FDRef fd;
@@ -3289,7 +3401,7 @@ int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, buffe
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = _fgetattr(**fd, n, bp);
lfn_close(fd);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
map<string, bufferlist> got;
set<string> to_get;
to_get.insert(string(name));
@@ -3323,8 +3435,11 @@ int FileStore::getattr(coll_t cid, const hobject_t& oid, const char *name, buffe
}
}
-int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only)
+int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only)
{
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
dout(15) << "getattrs " << cid << "/" << oid << dendl;
FDRef fd;
int r = lfn_open(cid, oid, false, &fd);
@@ -3332,43 +3447,43 @@ int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>&
goto out;
}
r = _fgetattrs(**fd, aset, user_only);
+ if (r < 0) {
+ goto out;
+ }
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- map<string, bufferlist> omap_aset;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- assert(omap_attrs.size() == omap_aset.size());
- for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
i != omap_aset.end();
++i) {
- string key;
- if (user_only) {
+ string key;
+ if (user_only) {
if (i->first[0] != '_')
continue;
if (i->first == "_")
continue;
key = i->first.substr(1, i->first.size());
- } else {
+ } else {
key = i->first;
- }
- aset.insert(make_pair(key,
- bufferptr(i->second.c_str(), i->second.length())));
}
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
}
out:
dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl;
@@ -3382,7 +3497,7 @@ int FileStore::getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>&
}
}
-int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset,
+int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
const SequencerPosition &spos)
{
map<string, bufferlist> omap_set;
@@ -3394,10 +3509,8 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
if (r < 0) {
goto out;
}
- if (g_conf->filestore_xattr_use_omap) {
- r = _fgetattrs(**fd, inline_set, false);
- assert(!m_filestore_fail_eio || r != -EIO);
- }
+ r = _fgetattrs(**fd, inline_set, false);
+ assert(!m_filestore_fail_eio || r != -EIO);
dout(15) << "setattrs " << cid << "/" << oid << dendl;
r = 0;
for (map<string,bufferptr>::iterator p = aset.begin();
@@ -3405,8 +3518,8 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
++p) {
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
- if (g_conf->filestore_xattr_use_omap) {
- if (p->second.length() > g_conf->filestore_max_inline_xattr_size) {
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3415,10 +3528,10 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
omap_set[p->first].push_back(p->second);
continue;
- }
+ }
- if (!inline_set.count(p->first) &&
- inline_set.size() >= g_conf->filestore_max_inline_xattrs) {
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3427,10 +3540,9 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
omap_set[p->first].push_back(p->second);
continue;
- }
- omap_remove.insert(p->first);
- inline_set.insert(*p);
}
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
inline_to_set.insert(*p);
@@ -3441,17 +3553,17 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
goto out_close;
if (!omap_remove.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->remove_xattrs(oid, omap_remove, &spos);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl;
assert(!m_filestore_fail_eio || r != -EIO);
goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
}
}
if (!omap_set.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->set_xattrs(oid, omap_set, &spos);
if (r < 0) {
dout(10) << __func__ << " could not set_xattrs r = " << r << dendl;
@@ -3467,7 +3579,7 @@ int FileStore::_setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>
}
-int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
+int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
const SequencerPosition &spos)
{
dout(15) << "rmattr " << cid << "/" << oid << " '" << name << "'" << dendl;
@@ -3479,7 +3591,7 @@ int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = chain_fremovexattr(**fd, n);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
Index index;
r = get_index(cid, &index);
if (r < 0) {
@@ -3502,13 +3614,15 @@ int FileStore::_rmattr(coll_t cid, const hobject_t& oid, const char *name,
return r;
}
-int FileStore::_rmattrs(coll_t cid, const hobject_t& oid,
+int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos)
{
dout(15) << "rmattrs " << cid << "/" << oid << dendl;
map<string,bufferptr> aset;
FDRef fd;
+ set<string> omap_attrs;
+ Index index;
int r = lfn_open(cid, oid, false, &fd);
if (r < 0) {
goto out;
@@ -3525,26 +3639,24 @@ int FileStore::_rmattrs(coll_t cid, const hobject_t& oid,
}
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- Index index;
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- return r;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- r = object_map->remove_xattrs(oid, omap_attrs, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
- return r;
- }
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ return r;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
+ return r;
}
+ if (r == -ENOENT)
+ r = 0;
out:
dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
return r;
@@ -3698,14 +3810,14 @@ int FileStore::_collection_remove_recursive(const coll_t &cid,
return r;
}
- vector<hobject_t> objects;
- hobject_t max;
+ vector<ghobject_t> objects;
+ ghobject_t max;
r = 0;
while (!max.is_max()) {
r = collection_list_partial(cid, max, 200, 300, 0, &objects, &max);
if (r < 0)
return r;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
assert(_check_replay_guard(cid, *i, spos));
@@ -3777,7 +3889,7 @@ int FileStore::collection_version_current(coll_t c, uint32_t *version)
if (r < 0)
return r;
*version = index->collection_version();
- if (*version == on_disk_version)
+ if (*version == target_version)
return 1;
else
return 0;
@@ -3870,9 +3982,9 @@ bool FileStore::collection_empty(coll_t c)
int r = get_index(c, &index);
if (r < 0)
return false;
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
collection_list_handle_t handle;
- r = index->collection_list_partial(hobject_t(), 1, 1, 0, &ls, NULL);
+ r = index->collection_list_partial(ghobject_t(), 1, 1, 0, &ls, NULL);
if (r < 0) {
assert(!m_filestore_fail_eio || r != -EIO);
return false;
@@ -3880,14 +3992,14 @@ bool FileStore::collection_empty(coll_t c)
return ls.empty();
}
-int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls)
+int FileStore::collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls)
{
bool done = false;
- hobject_t next = start;
+ ghobject_t next = start;
while (!done) {
- vector<hobject_t> next_objects;
+ vector<ghobject_t> next_objects;
int r = collection_list_partial(c, next,
get_ideal_list_min(), get_ideal_list_max(),
seq, &next_objects, &next);
@@ -3914,10 +4026,11 @@ int FileStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
return 0;
}
-int FileStore::collection_list_partial(coll_t c, hobject_t start,
+int FileStore::collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t seq,
- vector<hobject_t> *ls, hobject_t *next)
+ vector<ghobject_t> *ls, ghobject_t *next)
{
+ dout(10) << "collection_list_partial: " << c << dendl;
Index index;
int r = get_index(c, &index);
if (r < 0)
@@ -3929,10 +4042,12 @@ int FileStore::collection_list_partial(coll_t c, hobject_t start,
assert(!m_filestore_fail_eio || r != -EIO);
return r;
}
+ if (ls)
+ dout(20) << "objects: " << *ls << dendl;
return 0;
}
-int FileStore::collection_list(coll_t c, vector<hobject_t>& ls)
+int FileStore::collection_list(coll_t c, vector<ghobject_t>& ls)
{
Index index;
int r = get_index(c, &index);
@@ -3943,7 +4058,7 @@ int FileStore::collection_list(coll_t c, vector<hobject_t>& ls)
return r;
}
-int FileStore::omap_get(coll_t c, const hobject_t &hoid,
+int FileStore::omap_get(coll_t c, const ghobject_t &hoid,
bufferlist *header,
map<string, bufferlist> *out)
{
@@ -3962,7 +4077,7 @@ int FileStore::omap_get(coll_t c, const hobject_t &hoid,
int FileStore::omap_get_header(
coll_t c,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
bufferlist *bl,
bool allow_eio)
{
@@ -3979,7 +4094,7 @@ int FileStore::omap_get_header(
return 0;
}
-int FileStore::omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys)
+int FileStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
IndexedPath path;
@@ -3994,7 +4109,7 @@ int FileStore::omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys)
return 0;
}
-int FileStore::omap_get_values(coll_t c, const hobject_t &hoid,
+int FileStore::omap_get_values(coll_t c, const ghobject_t &hoid,
const set<string> &keys,
map<string, bufferlist> *out)
{
@@ -4011,7 +4126,7 @@ int FileStore::omap_get_values(coll_t c, const hobject_t &hoid,
return 0;
}
-int FileStore::omap_check_keys(coll_t c, const hobject_t &hoid,
+int FileStore::omap_check_keys(coll_t c, const ghobject_t &hoid,
const set<string> &keys,
set<string> *out)
{
@@ -4029,7 +4144,7 @@ int FileStore::omap_check_keys(coll_t c, const hobject_t &hoid,
}
ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(coll_t c,
- const hobject_t &hoid)
+ const ghobject_t &hoid)
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
IndexedPath path;
@@ -4100,7 +4215,7 @@ int FileStore::_destroy_collection(coll_t c)
}
-int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
+int FileStore::_collection_add(coll_t c, coll_t oldcid, const ghobject_t& o,
const SequencerPosition& spos)
{
dout(15) << "collection_add " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
@@ -4148,8 +4263,8 @@ int FileStore::_collection_add(coll_t c, coll_t oldcid, const hobject_t& o,
return r;
}
-int FileStore::_collection_move_rename(coll_t oldcid, const hobject_t& oldoid,
- coll_t c, const hobject_t& o,
+int FileStore::_collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
const SequencerPosition& spos)
{
dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
@@ -4228,7 +4343,7 @@ void FileStore::_inject_failure()
}
}
-int FileStore::_omap_clear(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
IndexedPath path;
@@ -4241,7 +4356,7 @@ int FileStore::_omap_clear(coll_t cid, const hobject_t &hoid,
return 0;
}
-int FileStore::_omap_setkeys(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
const map<string, bufferlist> &aset,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
@@ -4252,7 +4367,7 @@ int FileStore::_omap_setkeys(coll_t cid, const hobject_t &hoid,
return object_map->set_keys(hoid, aset, &spos);
}
-int FileStore::_omap_rmkeys(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
const set<string> &keys,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
@@ -4266,7 +4381,7 @@ int FileStore::_omap_rmkeys(coll_t cid, const hobject_t &hoid,
return 0;
}
-int FileStore::_omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid,
const string& first, const string& last,
const SequencerPosition &spos) {
dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
@@ -4283,7 +4398,7 @@ int FileStore::_omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
return _omap_rmkeys(cid, hoid, keys, spos);
}
-int FileStore::_omap_setheader(coll_t cid, const hobject_t &hoid,
+int FileStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
const bufferlist &bl,
const SequencerPosition &spos)
{
@@ -4343,8 +4458,8 @@ int FileStore::_split_collection(coll_t cid,
_close_replay_guard(dest, spos);
}
if (g_conf->filestore_debug_verify_split) {
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
while (1) {
collection_list_partial(
cid,
@@ -4354,7 +4469,7 @@ int FileStore::_split_collection(coll_t cid,
&next);
if (objects.empty())
break;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
dout(20) << __func__ << ": " << *i << " still in source "
@@ -4363,7 +4478,7 @@ int FileStore::_split_collection(coll_t cid,
}
objects.clear();
}
- next = hobject_t();
+ next = ghobject_t();
while (1) {
collection_list_partial(
dest,
@@ -4373,7 +4488,7 @@ int FileStore::_split_collection(coll_t cid,
&next);
if (objects.empty())
break;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
dout(20) << __func__ << ": " << *i << " now in dest "
@@ -4439,6 +4554,8 @@ const char** FileStore::get_tracked_conf_keys() const
"filestore_kill_at",
"filestore_fail_eio",
"filestore_replica_fadvise",
+ "filestore_sloppy_crc",
+ "filestore_sloppy_crc_block_size",
NULL
};
return KEYS;
@@ -4447,6 +4564,17 @@ const char** FileStore::get_tracked_conf_keys() const
void FileStore::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other")) {
+ Mutex::Locker l(lock);
+ set_xattr_limits_via_conf();
+ }
if (changed.count("filestore_min_sync_interval") ||
changed.count("filestore_max_sync_interval") ||
changed.count("filestore_queue_max_ops") ||
@@ -4455,6 +4583,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
changed.count("filestore_queue_committing_max_bytes") ||
changed.count("filestore_kill_at") ||
changed.count("filestore_fail_eio") ||
+ changed.count("filestore_sloppy_crc") ||
+ changed.count("filestore_sloppy_crc_block_size") ||
changed.count("filestore_replica_fadvise")) {
Mutex::Locker l(lock);
m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
@@ -4466,6 +4596,8 @@ void FileStore::handle_conf_change(const struct md_config_t *conf,
m_filestore_kill_at.set(conf->filestore_kill_at);
m_filestore_fail_eio = conf->filestore_fail_eio;
m_filestore_replica_fadvise = conf->filestore_replica_fadvise;
+ m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+ m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
}
if (changed.count("filestore_commit_timeout")) {
Mutex::Locker l(sync_entry_timeo_lock);
@@ -4521,3 +4653,77 @@ void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t
m_filestore_dump_fmt.flush(m_filestore_dump);
m_filestore_dump.flush();
}
+
+void FileStore::set_xattr_limits_via_conf()
+{
+ uint32_t fs_xattr_size;
+ uint32_t fs_xattrs;
+
+ assert(m_fs_type != FS_TYPE_NONE);
+
+ switch(m_fs_type) {
+ case FS_TYPE_XFS:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
+ break;
+ case FS_TYPE_BTRFS:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
+ break;
+ case FS_TYPE_ZFS:
+ case FS_TYPE_OTHER:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
+ break;
+ default:
+ assert(!"Unknown fs type");
+ }
+
+ //Use override value if set
+ if (g_conf->filestore_max_inline_xattr_size)
+ m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size;
+ else
+ m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+ //Use override value if set
+ if (g_conf->filestore_max_inline_xattrs)
+ m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
+ else
+ m_filestore_max_inline_xattrs = fs_xattrs;
+}
+
+// -- FSSuperblock --
+
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ compat_features.encode(bl);
+ ENCODE_FINISH(bl);
+}
+
+void FSSuperblock::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ compat_features.decode(bl);
+ DECODE_FINISH(bl);
+}
+
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->close_section();
+}
+
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+ FSSuperblock z;
+ o.push_back(new FSSuperblock(z));
+ CompatSet::FeatureSet feature_compat;
+ CompatSet::FeatureSet feature_ro_compat;
+ CompatSet::FeatureSet feature_incompat;
+ feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ z.compat_features = CompatSet(feature_compat, feature_ro_compat,
+ feature_incompat);
+ o.push_back(new FSSuperblock(z));
+}
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index 4f58df4d698..c489fdd5796 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -64,8 +64,36 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1);
#endif
+enum fs_types {
+ FS_TYPE_NONE = 0,
+ FS_TYPE_XFS,
+ FS_TYPE_BTRFS,
+ FS_TYPE_ZFS,
+ FS_TYPE_OTHER
+};
+
class FileStoreBackend;
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+class FSSuperblock {
+public:
+ CompatSet compat_features;
+
+ FSSuperblock() { }
+
+ void encode(bufferlist &bl) const;
+ void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+inline ostream& operator<<(ostream& out, const FSSuperblock& sb)
+{
+ return out << "sb(" << sb.compat_features << ")";
+}
+
class FileStore : public JournalingObjectStore,
public md_config_obs_t
{
@@ -89,7 +117,7 @@ public:
return perf_tracker.get_cur_stats();
}
- static const uint32_t on_disk_version = 3;
+ static const uint32_t target_version = 3;
private:
string internal_name; ///< internal name, used to name the perfcounter instance
string basedir, journalpath;
@@ -281,25 +309,26 @@ private:
void op_queue_release_throttle(Op *o);
void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
friend struct C_JournaledAhead;
+ int write_version_stamp();
int open_journal();
PerfCounters *logger;
public:
- int lfn_find(coll_t cid, const hobject_t& oid, IndexedPath *path);
- int lfn_truncate(coll_t cid, const hobject_t& oid, off_t length);
- int lfn_stat(coll_t cid, const hobject_t& oid, struct stat *buf);
+ int lfn_find(coll_t cid, const ghobject_t& oid, IndexedPath *path);
+ int lfn_truncate(coll_t cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(coll_t cid, const ghobject_t& oid, struct stat *buf);
int lfn_open(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
bool create,
FDRef *outfd,
IndexedPath *path = 0,
Index *index = 0);
void lfn_close(FDRef fd);
- int lfn_link(coll_t c, coll_t newcid, const hobject_t& o, const hobject_t& newoid) ;
- int lfn_unlink(coll_t cid, const hobject_t& o, const SequencerPosition &spos,
+ int lfn_link(coll_t c, coll_t newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+ int lfn_unlink(coll_t cid, const ghobject_t& o, const SequencerPosition &spos,
bool force_clear_omap=false);
public:
@@ -310,7 +339,6 @@ public:
int _sanity_check_fs();
bool test_mount_in_use();
- int write_version_stamp();
int version_stamp_is_valid(uint32_t *version);
int update_version_stamp();
int read_op_seq(uint64_t *seq);
@@ -321,6 +349,22 @@ public:
int mkfs();
int mkjournal();
+ /**
+ * set_allow_sharded_objects()
+ *
+ * Before sharded ghobject_t can be specified this function must be called
+ *
+ * Once this function is called the FileStore is not mountable by prior releases
+ */
+ void set_allow_sharded_objects();
+
+ /**
+ * get_allow_sharded_objects()
+ *
+ * return value: true if set_allow_sharded_objects() called, otherwise false
+ */
+ bool get_allow_sharded_objects();
+
int statfs(struct statfs *buf);
int _do_transactions(
@@ -329,7 +373,9 @@ public:
int do_transactions(list<Transaction*> &tls, uint64_t op_seq) {
return _do_transactions(tls, op_seq, 0);
}
- unsigned _do_transaction(Transaction& t, uint64_t op_seq, int trans_num);
+ unsigned _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle);
int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
TrackedOpRef op = TrackedOpRef());
@@ -345,7 +391,7 @@ public:
*/
void _set_replay_guard(int fd,
const SequencerPosition& spos,
- const hobject_t *hoid=0,
+ const ghobject_t *oid=0,
bool in_progress=false);
void _set_replay_guard(coll_t cid,
const SequencerPosition& spos,
@@ -375,42 +421,42 @@ public:
*/
int _check_replay_guard(int fd, const SequencerPosition& spos);
int _check_replay_guard(coll_t cid, const SequencerPosition& spos);
- int _check_replay_guard(coll_t cid, hobject_t oid, const SequencerPosition& pos);
+ int _check_replay_guard(coll_t cid, ghobject_t oid, const SequencerPosition& pos);
int _check_global_replay_guard(coll_t cid, const SequencerPosition& spos);
// ------------------
// objects
- int pick_object_revision_lt(hobject_t& oid) {
+ int pick_object_revision_lt(ghobject_t& oid) {
return 0;
}
- bool exists(coll_t cid, const hobject_t& oid);
+ bool exists(coll_t cid, const ghobject_t& oid);
int stat(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
struct stat *st,
bool allow_eio = false);
int read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
bool allow_eio = false);
- int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
+ int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
- int _touch(coll_t cid, const hobject_t& oid);
- int _write(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
+ int _touch(coll_t cid, const ghobject_t& oid);
+ int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
bool replica = false);
- int _zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len);
- int _truncate(coll_t cid, const hobject_t& oid, uint64_t size);
- int _clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+ int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size);
+ int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
const SequencerPosition& spos);
- int _clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
+ int _clone_range(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid,
uint64_t srcoff, uint64_t len, uint64_t dstoff,
const SequencerPosition& spos);
int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
- int _remove(coll_t cid, const hobject_t& oid, const SequencerPosition &spos);
+ int _remove(coll_t cid, const ghobject_t& oid, const SequencerPosition &spos);
int _fgetattr(int fd, const char *name, bufferptr& bp);
int _fgetattrs(int fd, map<string,bufferptr>& aset, bool user_only);
@@ -434,25 +480,25 @@ public:
// DEBUG read error injection, an object is removed from both on delete()
Mutex read_error_lock;
- set<hobject_t> data_error_set; // read() will return -EIO
- set<hobject_t> mdata_error_set; // getattr(),stat() will return -EIO
- void inject_data_error(const hobject_t &oid);
- void inject_mdata_error(const hobject_t &oid);
- void debug_obj_on_delete(const hobject_t &oid);
- bool debug_data_eio(const hobject_t &oid);
- bool debug_mdata_eio(const hobject_t &oid);
+ set<ghobject_t> data_error_set; // read() will return -EIO
+ set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid);
+ void inject_mdata_error(const ghobject_t &oid);
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
int snapshot(const string& name);
// attrs
- int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr &bp);
- int getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only = false);
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr &bp);
+ int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false);
- int _setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset,
+ int _setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset,
const SequencerPosition &spos);
- int _rmattr(coll_t cid, const hobject_t& oid, const char *name,
+ int _rmattr(coll_t cid, const ghobject_t& oid, const char *name,
const SequencerPosition &spos);
- int _rmattrs(coll_t cid, const hobject_t& oid,
+ int _rmattrs(coll_t cid, const ghobject_t& oid,
const SequencerPosition &spos);
int collection_getattr(coll_t c, const char *name, void *value, size_t size);
@@ -473,35 +519,35 @@ public:
int collection_stat(coll_t c, struct stat *st);
bool collection_exists(coll_t c);
bool collection_empty(coll_t c);
- int collection_list(coll_t c, vector<hobject_t>& o);
- int collection_list_partial(coll_t c, hobject_t start,
+ int collection_list(coll_t c, vector<ghobject_t>& oid);
+ int collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t snap,
- vector<hobject_t> *ls, hobject_t *next);
- int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls);
+ vector<ghobject_t> *ls, ghobject_t *next);
+ int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls);
// omap (see ObjectStore.h for documentation)
- int omap_get(coll_t c, const hobject_t &hoid, bufferlist *header,
+ int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header,
map<string, bufferlist> *out);
int omap_get_header(
coll_t c,
- const hobject_t &hoid,
+ const ghobject_t &oid,
bufferlist *out,
bool allow_eio = false);
- int omap_get_keys(coll_t c, const hobject_t &hoid, set<string> *keys);
- int omap_get_values(coll_t c, const hobject_t &hoid, const set<string> &keys,
+ int omap_get_keys(coll_t c, const ghobject_t &oid, set<string> *keys);
+ int omap_get_values(coll_t c, const ghobject_t &oid, const set<string> &keys,
map<string, bufferlist> *out);
- int omap_check_keys(coll_t c, const hobject_t &hoid, const set<string> &keys,
+ int omap_check_keys(coll_t c, const ghobject_t &oid, const set<string> &keys,
set<string> *out);
- ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const hobject_t &hoid);
+ ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, const ghobject_t &oid);
int _create_collection(coll_t c);
int _create_collection(coll_t c, const SequencerPosition &spos);
int _destroy_collection(coll_t c);
- int _collection_add(coll_t c, coll_t ocid, const hobject_t& o,
+ int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid,
const SequencerPosition& spos);
- int _collection_move_rename(coll_t oldcid, const hobject_t& oldoid,
- coll_t c, const hobject_t& o,
+ int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
const SequencerPosition& spos);
void dump_start(const std::string& file);
void dump_stop();
@@ -511,17 +557,17 @@ private:
void _inject_failure();
// omap
- int _omap_clear(coll_t cid, const hobject_t &hoid,
+ int _omap_clear(coll_t cid, const ghobject_t &oid,
const SequencerPosition &spos);
- int _omap_setkeys(coll_t cid, const hobject_t &hoid,
+ int _omap_setkeys(coll_t cid, const ghobject_t &oid,
const map<string, bufferlist> &aset,
const SequencerPosition &spos);
- int _omap_rmkeys(coll_t cid, const hobject_t &hoid, const set<string> &keys,
+ int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set<string> &keys,
const SequencerPosition &spos);
- int _omap_rmkeyrange(coll_t cid, const hobject_t &hoid,
+ int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
const string& first, const string& last,
const SequencerPosition &spos);
- int _omap_setheader(coll_t cid, const hobject_t &hoid, const bufferlist &bl,
+ int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl,
const SequencerPosition &spos);
int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest,
const SequencerPosition &spos);
@@ -553,6 +599,34 @@ private:
std::ofstream m_filestore_dump;
JSONFormatter m_filestore_dump_fmt;
atomic_t m_filestore_kill_at;
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ enum fs_types m_fs_type;
+
+ //Determined xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+ * Write superblock to persisent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
friend class FileStoreBackend;
};
@@ -586,6 +660,9 @@ protected:
int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
}
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
public:
FileStoreBackend(FileStore *fs) : filestore(fs) {}
virtual ~FileStoreBackend() {};
@@ -601,6 +678,15 @@ public:
virtual bool has_fiemap() = 0;
virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out) = 0;
};
#endif
diff --git a/src/os/FlatIndex.cc b/src/os/FlatIndex.cc
index db46750e411..d4644abc627 100644
--- a/src/os/FlatIndex.cc
+++ b/src/os/FlatIndex.cc
@@ -134,18 +134,18 @@ static void lfn_translate(const char *path, const char *name, char *new_name, in
return;
}
-static int append_oname(const hobject_t &oid, char *s, int len)
+static int append_oname(const ghobject_t &oid, char *s, int len)
{
//assert(sizeof(oid) == 28);
char *end = s + len;
char *t = s + strlen(s);
- const char *i = oid.oid.name.c_str();
+ const char *i = oid.hobj.oid.name.c_str();
while (*i && t < end) {
if (*i == '\\') {
*t++ = '\\';
*t++ = '\\';
- } else if (*i == '.' && i == oid.oid.name.c_str()) { // only escape leading .
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
*t++ = '\\';
*t++ = '.';
} else if (*i == '/') {
@@ -158,17 +158,17 @@ static int append_oname(const hobject_t &oid, char *s, int len)
int size = t - s;
- if (oid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
size += snprintf(t, end - t, "_head");
- else if (oid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
size += snprintf(t, end - t, "_snapdir");
else
- size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.snap);
+ size += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
return size;
}
-static bool parse_object(char *s, hobject_t& oid)
+static bool parse_object(char *s, ghobject_t& oid)
{
sobject_t o;
char *bar = s + strlen(s) - 1;
@@ -201,13 +201,13 @@ static bool parse_object(char *s, hobject_t& oid)
o.snap = CEPH_SNAPDIR;
else
o.snap = strtoull(bar+1, &s, 16);
- oid = hobject_t(o);
+ oid = ghobject_t(hobject_t(o));
return true;
}
return false;
}
-static int lfn_get(const char *coll_path, const hobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
+static int lfn_get(const char *coll_path, const ghobject_t& oid, char *pathname, int len, char *lfn, int lfn_len, int *exist, int *is_lfn)
{
int i = 0;
strncpy(pathname, coll_path, len);
@@ -277,7 +277,7 @@ int FlatIndex::init() {
return 0;
}
-int FlatIndex::created(const hobject_t &hoid, const char *path) {
+int FlatIndex::created(const ghobject_t &hoid, const char *path) {
char long_name[PATH_MAX];
long_name[0] = '\0';
int actual_len = append_oname(hoid, long_name, sizeof(long_name));
@@ -292,7 +292,7 @@ int FlatIndex::created(const hobject_t &hoid, const char *path) {
return 0;
}
-int FlatIndex::unlink(const hobject_t &o) {
+int FlatIndex::unlink(const ghobject_t &o) {
char long_fn[PATH_MAX];
char short_fn[PATH_MAX];
char short_fn2[PATH_MAX];
@@ -346,7 +346,7 @@ int FlatIndex::unlink(const hobject_t &o) {
return 0;
}
-int FlatIndex::lookup(const hobject_t &hoid, IndexedPath *path, int *exist) {
+int FlatIndex::lookup(const ghobject_t &hoid, IndexedPath *path, int *exist) {
char long_fn[PATH_MAX];
char short_fn[PATH_MAX];
int r;
@@ -361,7 +361,7 @@ int FlatIndex::lookup(const hobject_t &hoid, IndexedPath *path, int *exist) {
}
static int get_hobject_from_oinfo(const char *dir, const char *file,
- hobject_t *o) {
+ ghobject_t *o) {
char path[PATH_MAX];
bufferptr bp(PATH_MAX);
snprintf(path, sizeof(path), "%s/%s", dir, file);
@@ -376,17 +376,17 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
return 0;
}
-int FlatIndex::collection_list_partial(const hobject_t &start,
+int FlatIndex::collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next) {
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
assert(0); // Should not be called
return 0;
}
-int FlatIndex::collection_list(vector<hobject_t> *ls) {
+int FlatIndex::collection_list(vector<ghobject_t> *ls) {
char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
char dir_name[PATH_MAX], new_name[PATH_MAX];
strncpy(dir_name, base_path.c_str(), sizeof(dir_name));
@@ -397,7 +397,7 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
return -errno;
// first, build (ino, object) list
- vector< pair<ino_t,hobject_t> > inolist;
+ vector< pair<ino_t,ghobject_t> > inolist;
struct dirent *de;
while (::readdir_r(dir, (struct dirent *)buf, &de) == 0) {
@@ -407,11 +407,11 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
if (de->d_name[0] == '.')
continue;
//cout << " got object " << de->d_name << std::endl;
- hobject_t o;
+ ghobject_t o;
lfn_translate(dir_name, de->d_name, new_name, sizeof(new_name));
if (parse_object(new_name, o)) {
get_hobject_from_oinfo(dir_name, de->d_name, &o);
- inolist.push_back(pair<ino_t,hobject_t>(de->d_ino, o));
+ inolist.push_back(pair<ino_t,ghobject_t>(de->d_ino, o));
ls->push_back(o);
}
}
@@ -422,7 +422,7 @@ int FlatIndex::collection_list(vector<hobject_t> *ls) {
// build final list
ls->resize(inolist.size());
int i = 0;
- for (vector< pair<ino_t,hobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
+ for (vector< pair<ino_t,ghobject_t> >::iterator p = inolist.begin(); p != inolist.end(); ++p)
(*ls)[i++].swap(p->second);
::closedir(dir);
diff --git a/src/os/FlatIndex.h b/src/os/FlatIndex.h
index 7a10912dc28..657c273468b 100644
--- a/src/os/FlatIndex.h
+++ b/src/os/FlatIndex.h
@@ -52,35 +52,35 @@ public:
/// @see CollectionIndex
int created(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const char *path
);
/// @see CollectionIndex
int unlink(
- const hobject_t &hoid
+ const ghobject_t &oid
);
/// @see CollectionIndex
int lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
IndexedPath *path,
int *exist
);
/// @see CollectionIndex
int collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
/// @see CollectionIndex
int collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
};
diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc
index 461158fdfab..81d896a0943 100644
--- a/src/os/GenericFileStoreBackend.cc
+++ b/src/os/GenericFileStoreBackend.cc
@@ -40,6 +40,12 @@
#include "common/config.h"
#include "common/sync_filesystem.h"
+#include "common/SloppyCRCMap.h"
+#include "os/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
#define dout_subsys ceph_subsys_filestore
#undef dout_prefix
#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
@@ -251,3 +257,110 @@ done_err:
free(fiemap);
return ret;
}
+
+
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+ char buf[100];
+ bufferptr bp;
+ int r = 0;
+ int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+ if (l == -ENODATA) {
+ return 0;
+ }
+ if (l >= 0) {
+ bp = buffer::create(l);
+ memcpy(bp.c_str(), buf, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+ if (l > 0) {
+ bp = buffer::create(l);
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+ }
+ }
+ bufferlist bl;
+ bl.append(bp);
+ bufferlist::iterator p = bl.begin();
+ try {
+ ::decode(*cm, p);
+ }
+ catch (buffer::error &e) {
+ r = -EIO;
+ }
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+ bufferlist bl;
+ ::encode(*cm, bl);
+ int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm.write(off, len, bl, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.truncate(off);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.zero(off, len);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff)
+{
+ SloppyCRCMap scm_src(get_crc_block_size());
+ SloppyCRCMap scm_dst(get_crc_block_size());
+ int r = _crc_load_or_init(srcfd, &scm_src);
+ if (r < 0)
+ return r;
+ r = _crc_load_or_init(destfd, &scm_dst);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(destfd, &scm_dst);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ return scm.read(off, len, bl, out);
+}
diff --git a/src/os/GenericFileStoreBackend.h b/src/os/GenericFileStoreBackend.h
index 95aca971708..5a09c2497a8 100644
--- a/src/os/GenericFileStoreBackend.h
+++ b/src/os/GenericFileStoreBackend.h
@@ -17,6 +17,8 @@
#include "FileStore.h"
+class SloppyCRCMap;
+
class GenericFileStoreBackend : public FileStoreBackend {
private:
bool ioctl_fiemap;
@@ -25,6 +27,7 @@ private:
public:
GenericFileStoreBackend(FileStore *fs);
virtual ~GenericFileStoreBackend() {};
+
virtual int detect_features();
virtual int create_current();
virtual bool can_checkpoint() { return false; };
@@ -39,5 +42,17 @@ public:
virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
return _copy_range(from, to, srcoff, len, dstoff);
}
+
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+ int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl);
+ virtual int _crc_update_truncate(int fd, loff_t off);
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len);
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff);
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out);
};
#endif
diff --git a/src/os/HashIndex.cc b/src/os/HashIndex.cc
index c279bab3a60..ea50cd038ca 100644
--- a/src/os/HashIndex.cc
+++ b/src/os/HashIndex.cc
@@ -66,7 +66,7 @@ int HashIndex::reset_attr(
return r;
if (!exists)
return 0;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
set<string> subdirs;
r = list_objects(path, 0, 0, &objects);
if (r < 0)
@@ -98,7 +98,7 @@ int HashIndex::col_split_level(
int r = from.list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
r = from.list_objects(path, 0, 0, &objects);
if (r < 0)
return r;
@@ -134,8 +134,8 @@ int HashIndex::col_split_level(
}
/* Then, do the same for each object */
- map<string, hobject_t> objs_to_move;
- for (map<string, hobject_t>::iterator i = objects.begin();
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
if (i->second.match(inbits, match)) {
@@ -199,7 +199,7 @@ int HashIndex::col_split_level(
return r;
}
- for (map<string, hobject_t>::iterator i = objs_to_move.begin();
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
i != objs_to_move.end();
++i) {
from_info.objs--;
@@ -244,7 +244,7 @@ int HashIndex::_init() {
/* LFNIndex virtual method implementations */
int HashIndex::_created(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name) {
subdir_info_s info;
int r;
@@ -267,10 +267,10 @@ int HashIndex::_created(const vector<string> &path,
}
int HashIndex::_remove(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name) {
int r;
- r = remove_object(path, hoid);
+ r = remove_object(path, oid);
if (r < 0)
return r;
subdir_info_s info;
@@ -291,12 +291,12 @@ int HashIndex::_remove(const vector<string> &path,
}
}
-int HashIndex::_lookup(const hobject_t &hoid,
+int HashIndex::_lookup(const ghobject_t &oid,
vector<string> *path,
string *mangled_name,
int *exists_out) {
vector<string> path_comp;
- get_path_components(hoid, &path_comp);
+ get_path_components(oid, &path_comp);
vector<string>::iterator next = path_comp.begin();
int exists;
while (1) {
@@ -313,22 +313,22 @@ int HashIndex::_lookup(const hobject_t &hoid,
break;
path->push_back(*(next++));
}
- return get_mangled_name(*path, hoid, mangled_name, exists_out);
+ return get_mangled_name(*path, oid, mangled_name, exists_out);
}
-int HashIndex::_collection_list(vector<hobject_t> *ls) {
+int HashIndex::_collection_list(vector<ghobject_t> *ls) {
vector<string> path;
return list_by_hash(path, 0, 0, 0, 0, ls);
}
-int HashIndex::_collection_list_partial(const hobject_t &start,
+int HashIndex::_collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next) {
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
vector<string> path;
- hobject_t _next;
+ ghobject_t _next;
if (!next)
next = &_next;
*next = start;
@@ -345,7 +345,7 @@ int HashIndex::recursive_remove(const vector<string> &path) {
int r = list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
r = list_objects(path, 0, 0, &objects);
if (r < 0)
return r;
@@ -475,7 +475,7 @@ int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
int level = info.hash_level;
- map<string, hobject_t> objects;
+ map<string, ghobject_t> objects;
vector<string> dst = path;
int r;
dst.push_back("");
@@ -486,17 +486,17 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
r = list_subdirs(path, &subdirs);
if (r < 0)
return r;
- map<string, map<string, hobject_t> > mapped;
- map<string, hobject_t> moved;
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
int num_moved = 0;
- for (map<string, hobject_t>::iterator i = objects.begin();
+ for (map<string, ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
vector<string> new_path;
get_path_components(i->second, &new_path);
mapped[new_path[level]][i->first] = i->second;
}
- for (map<string, map<string, hobject_t> >::iterator i = mapped.begin();
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
i != mapped.end();
) {
dst[level] = i->first;
@@ -505,7 +505,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
subdir_info_s temp;
// subdir has already been fully copied
if (subdirs.count(i->first) && !get_info(dst, &temp)) {
- for (map<string, hobject_t>::iterator j = i->second.begin();
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
j != i->second.end();
++j) {
moved[j->first] = j->second;
@@ -533,7 +533,7 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
return r;
} // else subdir has been created but only partially copied
- for (map<string, hobject_t>::iterator j = i->second.begin();
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
j != i->second.end();
++j) {
moved[j->first] = j->second;
@@ -574,12 +574,12 @@ int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
return end_split_or_merge(path);
}
-void HashIndex::get_path_components(const hobject_t &hoid,
+void HashIndex::get_path_components(const ghobject_t &oid,
vector<string> *path) {
char buf[MAX_HASH_LEVEL + 1];
- snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)hoid.get_filestore_key());
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_filestore_key());
- // Path components are the hex characters of hoid.hash, least
+ // Path components are the hex characters of oid.hobj.hash, least
// significant first
for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
path->push_back(string(&buf[i], 1));
@@ -596,9 +596,9 @@ string HashIndex::get_hash_str(uint32_t hash) {
return retval;
}
-string HashIndex::get_path_str(const hobject_t &hoid) {
- assert(!hoid.is_max());
- return get_hash_str(hoid.hash);
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ assert(!oid.is_max());
+ return get_hash_str(oid.hobj.hash);
}
uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
@@ -616,12 +616,12 @@ uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
int HashIndex::get_path_contents_by_hash(const vector<string> &path,
const string *lower_bound,
- const hobject_t *next_object,
+ const ghobject_t *next_object,
const snapid_t *seq,
set<string> *hash_prefixes,
- set<pair<string, hobject_t> > *objects) {
+ set<pair<string, ghobject_t> > *objects) {
set<string> subdirs;
- map<string, hobject_t> rev_objects;
+ map<string, ghobject_t> rev_objects;
int r;
string cur_prefix;
for (vector<string>::const_iterator i = path.begin();
@@ -632,7 +632,7 @@ int HashIndex::get_path_contents_by_hash(const vector<string> &path,
r = list_objects(path, 0, 0, &rev_objects);
if (r < 0)
return r;
- for (map<string, hobject_t>::iterator i = rev_objects.begin();
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
i != rev_objects.end();
++i) {
string hash_prefix = get_path_str(i->second);
@@ -640,10 +640,10 @@ int HashIndex::get_path_contents_by_hash(const vector<string> &path,
continue;
if (next_object && i->second < *next_object)
continue;
- if (seq && i->second.snap < *seq)
+ if (seq && i->second.hobj.snap < *seq)
continue;
hash_prefixes->insert(hash_prefix);
- objects->insert(pair<string, hobject_t>(hash_prefix, i->second));
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
}
r = list_subdirs(path, &subdirs);
if (r < 0)
@@ -667,13 +667,13 @@ int HashIndex::list_by_hash(const vector<string> &path,
int min_count,
int max_count,
snapid_t seq,
- hobject_t *next,
- vector<hobject_t> *out) {
+ ghobject_t *next,
+ vector<ghobject_t> *out) {
assert(out);
vector<string> next_path = path;
next_path.push_back("");
set<string> hash_prefixes;
- set<pair<string, hobject_t> > objects;
+ set<pair<string, ghobject_t> > objects;
int r = get_path_contents_by_hash(path,
NULL,
next,
@@ -686,16 +686,16 @@ int HashIndex::list_by_hash(const vector<string> &path,
for (set<string>::iterator i = hash_prefixes.begin();
i != hash_prefixes.end();
++i) {
- set<pair<string, hobject_t> >::iterator j = objects.lower_bound(
- make_pair(*i, hobject_t()));
+ set<pair<string, ghobject_t> >::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
if (j == objects.end() || j->first != *i) {
if (min_count > 0 && out->size() > (unsigned)min_count) {
if (next)
- *next = hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, "");
+ *next = ghobject_t(hobject_t("", "", CEPH_NOSNAP, hash_prefix_to_hash(*i), -1, ""));
return 0;
}
*(next_path.rbegin()) = *(i->rbegin());
- hobject_t next_recurse;
+ ghobject_t next_recurse;
if (next)
next_recurse = *next;
r = list_by_hash(next_path,
@@ -727,6 +727,6 @@ int HashIndex::list_by_hash(const vector<string> &path,
}
}
if (next)
- *next = hobject_t::get_max();
+ *next = ghobject_t(hobject_t::get_max());
return 0;
}
diff --git a/src/os/HashIndex.h b/src/os/HashIndex.h
index fcabd9f7198..6f5bca077d4 100644
--- a/src/os/HashIndex.h
+++ b/src/os/HashIndex.h
@@ -39,7 +39,7 @@
* given by the hex characters in the hash beginning with the least
* significant.
*
- * ex: hobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * ex: ghobject_t("object", CEPH_NOSNAP, 0xA4CEE0D2)
* would be located in (root)/2/D/0/
*
* Subdirectories are created when the number of objects in a directory
@@ -163,30 +163,30 @@ protected:
int _created(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name
);
int _remove(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name
);
int _lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
vector<string> *path,
string *mangled_name,
int *exists
);
int _collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
private:
/// Recursively remove path and its subdirs
@@ -262,7 +262,7 @@ private:
/// Determine path components from hoid hash
void get_path_components(
- const hobject_t &hoid, ///< [in] Object for which to get path components
+ const ghobject_t &oid, ///< [in] Object for which to get path components
vector<string> *path ///< [out] Path components for hoid.
);
@@ -278,12 +278,12 @@ private:
/**
- * Get string representation of hobject_t/hash
+ * Get string representation of ghobject_t/hash
*
* e.g: 0x01234567 -> "76543210"
*/
static string get_path_str(
- const hobject_t &hoid ///< [in] Object to get hash string for
+ const ghobject_t &oid ///< [in] Object to get hash string for
); ///< @return Hash string for hoid.
/// Get string from hash, @see get_path_str
@@ -319,20 +319,20 @@ private:
int get_path_contents_by_hash(
const vector<string> &path, /// [in] Path to list
const string *lower_bound, /// [in] list > *lower_bound
- const hobject_t *next_object, /// [in] list > *next_object
+ const ghobject_t *next_object, /// [in] list > *next_object
const snapid_t *seq, /// [in] list >= *seq
set<string> *hash_prefixes, /// [out] prefixes in dir
- set<pair<string, hobject_t> > *objects /// [out] objects
+ set<pair<string, ghobject_t> > *objects /// [out] objects
);
- /// List objects in collection in hobject_t order
+ /// List objects in collection in ghobject_t order
int list_by_hash(
const vector<string> &path, /// [in] Path to list
int min_count, /// [in] List at least min_count
int max_count, /// [in] List at most max_count
snapid_t seq, /// [in] list only objects where snap >= seq
- hobject_t *next, /// [in,out] List objects >= *next
- vector<hobject_t> *out /// [out] Listed objects
+ ghobject_t *next, /// [in,out] List objects >= *next
+ vector<ghobject_t> *out /// [out] Listed objects
); ///< @return Error Code, 0 on success
};
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 412721a04c8..83bbfc9703e 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -75,7 +75,7 @@ int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
return r;
HashIndex index(c, path, g_conf->filestore_merge_threshold,
g_conf->filestore_split_multiple,
- CollectionIndex::HASH_INDEX_TAG_2,
+ version,
g_conf->filestore_index_retry_probability);
return index.init();
}
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 029e8ad8197..83e1c144754 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -73,7 +73,7 @@ int LFNIndex::init()
return _init();
}
-int LFNIndex::created(const hobject_t &hoid, const char *path)
+int LFNIndex::created(const ghobject_t &oid, const char *path)
{
WRAP_RETRY(
vector<string> path_comp;
@@ -81,38 +81,39 @@ int LFNIndex::created(const hobject_t &hoid, const char *path)
r = decompose_full_path(path, &path_comp, 0, &short_name);
if (r < 0)
goto out;
- r = lfn_created(path_comp, hoid, short_name);
+ r = lfn_created(path_comp, oid, short_name);
if (r < 0)
goto out;
- r = _created(path_comp, hoid, short_name);
+ r = _created(path_comp, oid, short_name);
if (r < 0)
goto out;
);
}
-int LFNIndex::unlink(const hobject_t &hoid)
+int LFNIndex::unlink(const ghobject_t &oid)
{
WRAP_RETRY(
vector<string> path;
string short_name;
- r = _lookup(hoid, &path, &short_name, NULL);
+ r = _lookup(oid, &path, &short_name, NULL);
if (r < 0) {
goto out;
}
- r = _remove(path, hoid, short_name);
+ r = _remove(path, oid, short_name);
if (r < 0) {
goto out;
}
);
}
-int LFNIndex::lookup(const hobject_t &hoid,
+int LFNIndex::lookup(const ghobject_t &oid,
IndexedPath *out_path,
- int *exist) {
+ int *exist)
+{
WRAP_RETRY(
vector<string> path;
string short_name;
- r = _lookup(hoid, &path, &short_name, exist);
+ r = _lookup(oid, &path, &short_name, exist);
if (r < 0)
goto out;
string full_path = get_full_path(path, short_name);
@@ -135,18 +136,18 @@ int LFNIndex::lookup(const hobject_t &hoid,
);
}
-int LFNIndex::collection_list(vector<hobject_t> *ls)
+int LFNIndex::collection_list(vector<ghobject_t> *ls)
{
return _collection_list(ls);
}
-int LFNIndex::collection_list_partial(const hobject_t &start,
+int LFNIndex::collection_list_partial(const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next)
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
{
return _collection_list_partial(start, min_count, max_count, seq, ls, next);
}
@@ -171,13 +172,14 @@ int LFNIndex::fsync_dir(const vector<string> &path)
int LFNIndex::link_object(const vector<string> &from,
const vector<string> &to,
- const hobject_t &hoid,
- const string &from_short_name) {
+ const ghobject_t &oid,
+ const string &from_short_name)
+{
int r;
string from_path = get_full_path(from, from_short_name);
string to_path;
maybe_inject_failure();
- r = lfn_get_name(to, hoid, 0, &to_path, 0);
+ r = lfn_get_name(to, oid, 0, &to_path, 0);
if (r < 0)
return r;
maybe_inject_failure();
@@ -190,10 +192,11 @@ int LFNIndex::link_object(const vector<string> &from,
}
int LFNIndex::remove_objects(const vector<string> &dir,
- const map<string, hobject_t> &to_remove,
- map<string, hobject_t> *remaining) {
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
set<string> clean_chains;
- for (map<string, hobject_t>::const_iterator to_clean = to_remove.begin();
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
to_clean != to_remove.end();
++to_clean) {
if (!lfn_is_hashed_filename(to_clean->first)) {
@@ -207,7 +210,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
continue;
set<int> holes;
- map<int, pair<string, hobject_t> > chain;
+ map<int, pair<string, ghobject_t> > chain;
for (int i = 0; ; ++i) {
string short_name = lfn_get_short_name(to_clean->second, i);
if (remaining->count(short_name)) {
@@ -219,7 +222,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
}
}
- map<int, pair<string, hobject_t > >::reverse_iterator candidate = chain.rbegin();
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
for (set<int>::iterator i = holes.begin();
i != holes.end();
++i) {
@@ -241,7 +244,7 @@ int LFNIndex::remove_objects(const vector<string> &dir,
if (r < 0)
return -errno;
remaining->erase(candidate->second.first);
- remaining->insert(pair<string, hobject_t>(
+ remaining->insert(pair<string, ghobject_t>(
lfn_get_short_name(candidate->second.second, *i),
candidate->second.second));
++candidate;
@@ -253,13 +256,14 @@ int LFNIndex::remove_objects(const vector<string> &dir,
}
int LFNIndex::move_objects(const vector<string> &from,
- const vector<string> &to) {
- map<string, hobject_t> to_move;
+ const vector<string> &to)
+{
+ map<string, ghobject_t> to_move;
int r;
r = list_objects(from, 0, NULL, &to_move);
if (r < 0)
return r;
- for (map<string,hobject_t>::iterator i = to_move.begin();
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
string from_path = get_full_path(from, i->first);
@@ -280,7 +284,7 @@ int LFNIndex::move_objects(const vector<string> &from,
r = fsync_dir(to);
if (r < 0)
return r;
- for (map<string,hobject_t>::iterator i = to_move.begin();
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
i != to_move.end();
++i) {
maybe_inject_failure();
@@ -293,21 +297,23 @@ int LFNIndex::move_objects(const vector<string> &from,
}
int LFNIndex::remove_object(const vector<string> &from,
- const hobject_t &hoid) {
+ const ghobject_t &oid)
+{
string short_name;
int r, exist;
maybe_inject_failure();
- r = get_mangled_name(from, hoid, &short_name, &exist);
+ r = get_mangled_name(from, oid, &short_name, &exist);
maybe_inject_failure();
if (r < 0)
return r;
- return lfn_unlink(from, hoid, short_name);
+ return lfn_unlink(from, oid, short_name);
}
int LFNIndex::get_mangled_name(const vector<string> &from,
- const hobject_t &hoid,
- string *mangled_name, int *exists) {
- return lfn_get_name(from, hoid, mangled_name, 0, exists);
+ const ghobject_t &oid,
+ string *mangled_name, int *exists)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, exists);
}
int LFNIndex::move_subdir(
@@ -315,7 +321,8 @@ int LFNIndex::move_subdir(
LFNIndex &dest,
const vector<string> &path,
string dir
- ) {
+ )
+{
vector<string> sub_path(path.begin(), path.end());
sub_path.push_back(dir);
string from_path(from.get_full_path_subdir(sub_path));
@@ -330,8 +337,9 @@ int LFNIndex::move_object(
LFNIndex &from,
LFNIndex &dest,
const vector<string> &path,
- const pair<string, hobject_t> &obj
- ) {
+ const pair<string, ghobject_t> &obj
+ )
+{
string from_path(from.get_full_path(path, obj.first));
string to_path;
string to_name;
@@ -358,7 +366,8 @@ int LFNIndex::move_object(
static int get_hobject_from_oinfo(const char *dir, const char *file,
- hobject_t *o) {
+ ghobject_t *o)
+{
char path[PATH_MAX];
bufferptr bp(PATH_MAX);
snprintf(path, sizeof(path), "%s/%s", dir, file);
@@ -375,7 +384,8 @@ static int get_hobject_from_oinfo(const char *dir, const char *file,
int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
- long *handle, map<string, hobject_t> *out) {
+ long *handle, map<string, ghobject_t> *out)
+{
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
@@ -402,7 +412,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
if (de->d_name[0] == '.')
continue;
string short_name(de->d_name);
- hobject_t obj;
+ ghobject_t obj;
if (lfn_is_object(short_name)) {
r = lfn_translate(to_list, short_name, &obj);
if (r < 0) {
@@ -416,7 +426,7 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
if (index_version == HASH_INDEX_TAG)
get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
- out->insert(pair<string, hobject_t>(short_name, obj));
+ out->insert(pair<string, ghobject_t>(short_name, obj));
++listed;
} else {
continue;
@@ -435,7 +445,8 @@ int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
}
int LFNIndex::list_subdirs(const vector<string> &to_list,
- set<string> *out) {
+ set<string> *out)
+{
string to_list_path = get_full_path_subdir(to_list);
DIR *dir = ::opendir(to_list_path.c_str());
char buf[offsetof(struct dirent, d_name) + PATH_MAX + 1];
@@ -449,7 +460,7 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
}
string short_name(de->d_name);
string demangled_name;
- hobject_t obj;
+ ghobject_t obj;
if (lfn_is_subdir(short_name, &demangled_name)) {
out->insert(demangled_name);
}
@@ -501,7 +512,8 @@ int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
int LFNIndex::add_attr_path(const vector<string> &path,
const string &attr_name,
- bufferlist &attr_value) {
+ bufferlist &attr_value)
+{
string full_path = get_full_path_subdir(path);
maybe_inject_failure();
return chain_setxattr(full_path.c_str(), mangle_attr_name(attr_name).c_str(),
@@ -511,7 +523,8 @@ int LFNIndex::add_attr_path(const vector<string> &path,
int LFNIndex::get_attr_path(const vector<string> &path,
const string &attr_name,
- bufferlist &attr_value) {
+ bufferlist &attr_value)
+{
string full_path = get_full_path_subdir(path);
size_t size = 1024; // Initial
while (1) {
@@ -536,22 +549,24 @@ int LFNIndex::get_attr_path(const vector<string> &path,
}
int LFNIndex::remove_attr_path(const vector<string> &path,
- const string &attr_name) {
+ const string &attr_name)
+{
string full_path = get_full_path_subdir(path);
string mangled_attr_name = mangle_attr_name(attr_name);
maybe_inject_failure();
return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
}
-string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
{
char s[FILENAME_MAX_LEN];
char *end = s + sizeof(s);
char *t = s;
- const char *i = hoid.oid.name.c_str();
+ assert(oid.generation == ghobject_t::NO_GEN);
+ const char *i = oid.hobj.oid.name.c_str();
// Escape subdir prefix
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
*t++ = '\\';
*t++ = 'd';
i += 4;
@@ -560,7 +575,7 @@ string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
if (*i == '\\') {
*t++ = '\\';
*t++ = '\\';
- } else if (*i == '.' && i == hoid.oid.name.c_str()) { // only escape leading .
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
*t++ = '\\';
*t++ = '.';
} else if (*i == '/') {
@@ -571,13 +586,13 @@ string LFNIndex::lfn_generate_object_name_keyless(const hobject_t &hoid)
i++;
}
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "_head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "_snapdir");
else
- t += snprintf(t, end - t, "_%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
return string(s);
}
@@ -601,94 +616,112 @@ static void append_escaped(string::const_iterator begin,
}
}
-string LFNIndex::lfn_generate_object_name(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name(const ghobject_t &oid)
{
if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(hoid);
+ return lfn_generate_object_name_keyless(oid);
if (index_version == HASH_INDEX_TAG_2)
- return lfn_generate_object_name_poolless(hoid);
+ return lfn_generate_object_name_poolless(oid);
string full_name;
- string::const_iterator i = hoid.oid.name.begin();
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
full_name.append("\\d");
i += 4;
- } else if (hoid.oid.name[0] == '.') {
+ } else if (oid.hobj.oid.name[0] == '.') {
full_name.append("\\.");
++i;
}
- append_escaped(i, hoid.oid.name.end(), &full_name);
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
full_name.append("_");
- append_escaped(hoid.get_key().begin(), hoid.get_key().end(), &full_name);
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
full_name.append("_");
char buf[PATH_MAX];
char *t = buf;
char *end = t + sizeof(buf);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
full_name += string(buf);
full_name.append("_");
- append_escaped(hoid.nspace.begin(), hoid.nspace.end(), &full_name);
+ append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
full_name.append("_");
t = buf;
end = t + sizeof(buf);
- if (hoid.pool == -1)
+ if (oid.hobj.pool == -1)
t += snprintf(t, end - t, "none");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.pool);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
full_name += string(buf);
+ if (oid.generation != ghobject_t::NO_GEN) {
+ assert(oid.shard_id != ghobject_t::NO_SHARD);
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.generation);
+ full_name += string(buf);
+
+ full_name.append("_");
+
+ t = buf;
+ end = t + sizeof(buf);
+ t += snprintf(t, end - t, "%x", (int)oid.shard_id);
+ full_name += string(buf);
+ }
+
return full_name;
}
-string LFNIndex::lfn_generate_object_name_poolless(const hobject_t &hoid)
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
{
if (index_version == HASH_INDEX_TAG)
- return lfn_generate_object_name_keyless(hoid);
+ return lfn_generate_object_name_keyless(oid);
+ assert(oid.generation == ghobject_t::NO_GEN);
string full_name;
- string::const_iterator i = hoid.oid.name.begin();
- if (hoid.oid.name.substr(0, 4) == "DIR_") {
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
full_name.append("\\d");
i += 4;
- } else if (hoid.oid.name[0] == '.') {
+ } else if (oid.hobj.oid.name[0] == '.') {
full_name.append("\\.");
++i;
}
- append_escaped(i, hoid.oid.name.end(), &full_name);
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
full_name.append("_");
- append_escaped(hoid.get_key().begin(), hoid.get_key().end(), &full_name);
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
full_name.append("_");
char snap_with_hash[PATH_MAX];
char *t = snap_with_hash;
char *end = t + sizeof(snap_with_hash);
- if (hoid.snap == CEPH_NOSNAP)
+ if (oid.hobj.snap == CEPH_NOSNAP)
t += snprintf(t, end - t, "head");
- else if (hoid.snap == CEPH_SNAPDIR)
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
t += snprintf(t, end - t, "snapdir");
else
- t += snprintf(t, end - t, "%llx", (long long unsigned)hoid.snap);
- snprintf(t, end - t, "_%.*X", (int)(sizeof(hoid.hash)*2), hoid.hash);
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.hash)*2), oid.hobj.hash);
full_name += string(snap_with_hash);
return full_name;
}
int LFNIndex::lfn_get_name(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
string *mangled_name, string *out_path,
int *exists)
{
string subdir_path = get_full_path_subdir(path);
- string full_name = lfn_generate_object_name(hoid);
+ string full_name = lfn_generate_object_name(oid);
int r;
if (!lfn_must_hash(full_name)) {
@@ -718,7 +751,7 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
string candidate_path;
char buf[FILENAME_MAX_LEN + 1];
for ( ; ; ++i) {
- candidate = lfn_get_short_name(hoid, i);
+ candidate = lfn_get_short_name(oid, i);
candidate_path = get_full_path(path, candidate);
r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
if (r < 0) {
@@ -757,20 +790,20 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
}
int LFNIndex::lfn_created(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name)
{
if (!lfn_is_hashed_filename(mangled_name))
return 0;
string full_path = get_full_path(path, mangled_name);
- string full_name = lfn_generate_object_name(hoid);
+ string full_name = lfn_generate_object_name(oid);
maybe_inject_failure();
return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
full_name.c_str(), full_name.size());
}
int LFNIndex::lfn_unlink(const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
const string &mangled_name)
{
if (!lfn_is_hashed_filename(mangled_name)) {
@@ -787,7 +820,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
int i = 0;
for ( ; ; ++i) {
- string candidate = lfn_get_short_name(hoid, i);
+ string candidate = lfn_get_short_name(oid, i);
if (candidate == mangled_name)
break;
}
@@ -795,7 +828,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
++i;
for ( ; ; ++i) {
struct stat buf;
- string to_check = lfn_get_short_name(hoid, i);
+ string to_check = lfn_get_short_name(oid, i);
string to_check_path = get_full_path(path, to_check);
int r = ::stat(to_check_path.c_str(), &buf);
if (r < 0) {
@@ -817,7 +850,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
return 0;
} else {
string rename_to = get_full_path(path, mangled_name);
- string rename_from = get_full_path(path, lfn_get_short_name(hoid, i - 1));
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
maybe_inject_failure();
int r = ::rename(rename_from.c_str(), rename_to.c_str());
maybe_inject_failure();
@@ -830,7 +863,7 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
int LFNIndex::lfn_translate(const vector<string> &path,
const string &short_name,
- hobject_t *out)
+ ghobject_t *out)
{
if (!lfn_is_hashed_filename(short_name)) {
return lfn_parse_object_name(short_name, out);
@@ -863,7 +896,7 @@ bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
return 0;
}
-static int parse_object(const char *s, hobject_t& o)
+static int parse_object(const char *s, ghobject_t& o)
{
const char *hash = s + strlen(s) - 1;
while (*hash != '_' &&
@@ -899,28 +932,28 @@ static int parse_object(const char *s, hobject_t& o)
i++;
}
*t = 0;
- o.oid.name = string(buf, t-buf);
+ o.hobj.oid.name = string(buf, t-buf);
if (strncmp(bar+1, "head", 4) == 0)
- o.snap = CEPH_NOSNAP;
+ o.hobj.snap = CEPH_NOSNAP;
else if (strncmp(bar+1, "snapdir", 7) == 0)
- o.snap = CEPH_SNAPDIR;
+ o.hobj.snap = CEPH_SNAPDIR;
else
- o.snap = strtoull(bar+1, NULL, 16);
- sscanf(hash, "_%X", &o.hash);
+ o.hobj.snap = strtoull(bar+1, NULL, 16);
+ sscanf(hash, "_%X", &o.hobj.hash);
return 1;
}
return 0;
}
-bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t *out)
+bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
{
bool r = parse_object(long_name.c_str(), *out);
int64_t pool = -1;
pg_t pg;
if (coll().is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- out->pool = pool;
+ out->hobj.pool = pool;
if (!r) return r;
string temp = lfn_generate_object_name(*out);
return r;
@@ -928,7 +961,8 @@ bool LFNIndex::lfn_parse_object_name_keyless(const string &long_name, hobject_t
static bool append_unescaped(string::const_iterator begin,
string::const_iterator end,
- string *out) {
+ string *out)
+{
for (string::const_iterator i = begin; i != end; ++i) {
if (*i == '\\') {
++i;
@@ -950,7 +984,8 @@ static bool append_unescaped(string::const_iterator begin,
}
bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
- hobject_t *out) {
+ ghobject_t *out)
+{
string name;
string key;
uint32_t hash;
@@ -1011,12 +1046,12 @@ bool LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
pg_t pg;
if (coll().is_pg_prefix(pg))
pool = (int64_t)pg.pool();
- (*out) = hobject_t(name, key, snap, hash, pool, "");
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
return true;
}
-bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
+bool LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
{
string name;
string key;
@@ -1024,6 +1059,8 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
uint32_t hash;
snapid_t snap;
uint64_t pool;
+ gen_t generation = ghobject_t::NO_GEN;
+ shard_t shard_id = ghobject_t::NO_SHARD;
if (index_version == HASH_INDEX_TAG)
return lfn_parse_object_name_keyless(long_name, out);
@@ -1081,10 +1118,28 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
current = ++end;
for ( ; end != long_name.end() && *end != '_'; ++end) ;
- if (end != long_name.end())
- return false;
string pstring(current, end);
+ // Optional generation/shard_id
+ string genstring, shardstring;
+ if (end != long_name.end()) {
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return false;
+ genstring = string(current, end);
+
+ generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return false;
+ shardstring = string(current, end);
+
+ shard_id = (shard_t)strtoul(shardstring.c_str(), NULL, 16);
+ }
+
if (snap_str == "head")
snap = CEPH_NOSNAP;
else if (snap_str == "snapdir")
@@ -1098,7 +1153,7 @@ bool LFNIndex::lfn_parse_object_name(const string &long_name, hobject_t *out)
else
pool = strtoull(pstring.c_str(), NULL, 16);
- (*out) = hobject_t(name, key, snap, hash, (int64_t)pool, ns);
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
return true;
}
@@ -1170,9 +1225,9 @@ void LFNIndex::build_filename(const char *old_filename, int i, char *filename, i
}
}
-string LFNIndex::lfn_get_short_name(const hobject_t &hoid, int i)
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
{
- string long_name = lfn_generate_object_name(hoid);
+ string long_name = lfn_generate_object_name(oid);
assert(lfn_must_hash(long_name));
char buf[FILENAME_SHORT_LEN + 4];
build_filename(long_name.c_str(), i, buf, sizeof(buf));
@@ -1212,7 +1267,7 @@ string LFNIndex::demangle_path_component(const string &component)
}
int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
- hobject_t *hoid, string *shortname)
+ ghobject_t *oid, string *shortname)
{
const char *beginning = in + get_base_path().size();
const char *end = beginning;
@@ -1228,8 +1283,8 @@ int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
}
}
*shortname = string(beginning, end - beginning);
- if (hoid) {
- int r = lfn_translate(*out, *shortname, hoid);
+ if (oid) {
+ int r = lfn_translate(*out, *shortname, oid);
if (r < 0)
return r;
}
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index b73ff4db268..f436446bf0f 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -165,35 +165,35 @@ public:
/// @see CollectionIndex
int created(
- const hobject_t &hoid,
+ const ghobject_t &oid,
const char *path
);
/// @see CollectionIndex
int unlink(
- const hobject_t &hoid
+ const ghobject_t &oid
);
/// @see CollectionIndex
int lookup(
- const hobject_t &hoid,
+ const ghobject_t &oid,
IndexedPath *path,
int *exist
);
/// @see CollectionIndex
int collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
);
/// @see CollectionIndex
int collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
);
virtual int _split(
@@ -221,20 +221,20 @@ protected:
/// Will be called upon object creation
virtual int _created(
const vector<string> &path, ///< [in] Path to subdir.
- const hobject_t &hoid, ///< [in] Object created.
+ const ghobject_t &oid, ///< [in] Object created.
const string &mangled_name ///< [in] Mangled filename.
) = 0;
/// Will be called to remove an object
virtual int _remove(
const vector<string> &path, ///< [in] Path to subdir.
- const hobject_t &hoid, ///< [in] Object to remove.
+ const ghobject_t &oid, ///< [in] Object to remove.
const string &mangled_name ///< [in] Mangled filename.
) = 0;
- /// Return the path and mangled_name for hoid.
+ /// Return the path and mangled_name for oid.
virtual int _lookup(
- const hobject_t &hoid,///< [in] Object for lookup.
+ const ghobject_t &oid,///< [in] Object for lookup.
vector<string> *path, ///< [out] Path to the object.
string *mangled_name, ///< [out] Mangled filename.
int *exists ///< [out] True if the object exists.
@@ -252,17 +252,17 @@ protected:
*/
/// List contents of collection.
virtual int _collection_list(
- vector<hobject_t> *ls ///< [out] Listed objects.
+ vector<ghobject_t> *ls ///< [out] Listed objects.
) = 0;
/// @see CollectionIndex
virtual int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
) = 0;
protected:
@@ -278,8 +278,8 @@ protected:
int link_object(
const vector<string> &from, ///< [in] Source subdirectory.
const vector<string> &to, ///< [in] Dest subdirectory.
- const hobject_t &hoid, ///< [in] Object to move.
- const string &from_short_name ///< [in] Mangled filename of hoid.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const string &from_short_name ///< [in] Mangled filename of oid.
); ///< @return Error Code, 0 on success
/**
@@ -296,8 +296,8 @@ protected:
*/
int remove_objects(
const vector<string> &dir,
- const map<string, hobject_t> &to_remove,
- map<string, hobject_t> *remaining
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining
);
@@ -322,11 +322,11 @@ protected:
*/
int remove_object(
const vector<string> &from, ///< [in] Directory from which to remove.
- const hobject_t &to_remove ///< [in] Object to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
);
/**
- * Gets the filename corresponding to hoid in from.
+ * Gets the filename corresponding to oid in from.
*
* The filename may differ between subdirectories. Furthermore,
* file creations ore removals in from may invalidate the name.
@@ -334,7 +334,7 @@ protected:
*/
int get_mangled_name(
const vector<string> &from, ///< [in] Subdirectory
- const hobject_t &hoid, ///< [in] Object
+ const ghobject_t &oid, ///< [in] Object
string *mangled_name, ///< [out] Filename
int *exists ///< [out] 1 if the file exists, else 0
);
@@ -352,7 +352,7 @@ protected:
LFNIndex &from, ///< [in] from index
LFNIndex &dest, ///< [in] to index
const vector<string> &path, ///< [in] path to split
- const pair<string, hobject_t> &obj ///< [in] obj to move
+ const pair<string, ghobject_t> &obj ///< [in] obj to move
);
/**
@@ -369,7 +369,7 @@ protected:
const vector<string> &to_list,
int max_objects,
long *handle,
- map<string, hobject_t> *out
+ map<string, ghobject_t> *out
);
/// Lists subdirectories.
@@ -425,43 +425,43 @@ private:
}
/**
- * Gets the filename corresponsing to hoid in path.
+ * Gets the filename corresponding to oid in path.
*
- * @param [in] path Path in which to get filename for hoid.
- * @param [in] hoid Object for which to get filename.
- * @param [out] mangled_name Filename for hoid, pass NULL if not needed.
- * @param [out] full_path Fullpath for hoid, pass NULL if not needed.
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
* @param [out] exists 1 if the file exists, 0 otherwise, pass NULL if
* not needed
* @return Error Code, 0 on success.
*/
int lfn_get_name(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &oid,
string *mangled_name,
string *full_path,
int *exists
);
- /// Adjusts path contents when hoid is created at name mangled_name.
+ /// Adjusts path contents when oid is created at name mangled_name.
int lfn_created(
const vector<string> &path, ///< [in] Path to adjust.
- const hobject_t &hoid, ///< [in] Object created.
+ const ghobject_t &oid, ///< [in] Object created.
const string &mangled_name ///< [in] Filename of created object.
);
- /// Removes hoid from path while adjusting path contents
+ /// Removes oid from path while adjusting path contents
int lfn_unlink(
- const vector<string> &path, ///< [in] Path containing hoid.
- const hobject_t &hoid, ///< [in] Object to remove.
+ const vector<string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
const string &mangled_name ///< [in] Filename of object to remove.
);
- ///Transate a file into and hobject_t.
+ /// Translate a file into a ghobject_t.
int lfn_translate(
const vector<string> &path, ///< [in] Path containing the file.
const string &short_name, ///< [in] Filename to translate.
- hobject_t *out ///< [out] Object found.
+ ghobject_t *out ///< [out] Object found.
); ///< @return Negative error code on error, 0 if not an object, 1 else
/* manglers/demanglers */
@@ -478,35 +478,35 @@ private:
/// Generate object name
string lfn_generate_object_name_keyless(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Generate object name
string lfn_generate_object_name_poolless(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Generate object name
string lfn_generate_object_name(
- const hobject_t &hoid ///< [in] Object for which to generate.
+ const ghobject_t &oid ///< [in] Object for which to generate.
); ///< @return Generated object name.
/// Parse object name
bool lfn_parse_object_name_keyless(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successfull, False otherwise.
/// Parse object name
bool lfn_parse_object_name_poolless(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successfull, False otherwise.
/// Parse object name
bool lfn_parse_object_name(
const string &long_name, ///< [in] Name to parse
- hobject_t *out ///< [out] Resulting Object
+ ghobject_t *out ///< [out] Resulting Object
); ///< @return True if successfull, False otherwise.
/// Checks whether short_name is a hashed filename.
@@ -521,7 +521,7 @@ private:
/// Generate hashed name.
string lfn_get_short_name(
- const hobject_t &hoid, ///< [in] Object for which to generate.
+ const ghobject_t &oid, ///< [in] Object for which to generate.
int i ///< [in] Index of hashed name to generate.
); ///< @return Hashed filename.
@@ -554,7 +554,7 @@ private:
int decompose_full_path(
const char *in, ///< [in] Full path to object.
vector<string> *out, ///< [out] Path to object at in.
- hobject_t *hoid, ///< [out] Object at in.
+ ghobject_t *oid, ///< [out] Object at in.
string *shortname ///< [out] Filename of object at in.
); ///< @return Error Code, 0 on success.
diff --git a/src/os/Makefile.am b/src/os/Makefile.am
index b7fef8dd209..4f12a6a3278 100644
--- a/src/os/Makefile.am
+++ b/src/os/Makefile.am
@@ -13,7 +13,8 @@ libos_la_SOURCES = \
os/WBThrottle.cc \
os/BtrfsFileStoreBackend.cc \
os/GenericFileStoreBackend.cc \
- os/ZFSFileStoreBackend.cc
+ os/ZFSFileStoreBackend.cc \
+ common/TrackedOp.cc
noinst_LTLIBRARIES += libos.la
noinst_HEADERS += \
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
index 5cc1e495de1..7717aac7437 100644
--- a/src/os/ObjectMap.h
+++ b/src/os/ObjectMap.h
@@ -30,102 +30,102 @@ class ObjectMap {
public:
/// Set keys and values from specified map
virtual int set_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const map<string, bufferlist> &set, ///< [in] key to value map to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Set header
virtual int set_header(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const bufferlist &bl, ///< [in] header to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Retrieve header
virtual int get_header(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
bufferlist *bl ///< [out] header to set
) = 0;
- /// Clear all map keys and values from hoid
+ /// Clear all map keys and values from oid
virtual int clear(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
- /// Clear all map keys and values from hoid
+ /// Clear all map keys and values from oid
virtual int rm_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &to_clear, ///< [in] Keys to clear
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// Get all keys and values
virtual int get(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
bufferlist *header, ///< [out] Returned Header
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
/// Get values for supplied keys
virtual int get_keys(
- const hobject_t &hoid, ///< [in] object containing map
- set<string> *keys ///< [out] Keys defined on hoid
+ const ghobject_t &oid, ///< [in] object containing map
+ set<string> *keys ///< [out] Keys defined on oid
) = 0;
/// Get values for supplied keys
virtual int get_values(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &keys, ///< [in] Keys to get
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
/// Check key existence
virtual int check_keys(
- const hobject_t &hoid, ///< [in] object containing map
+ const ghobject_t &oid, ///< [in] object containing map
const set<string> &keys, ///< [in] Keys to check
- set<string> *out ///< [out] Subset of keys defined on hoid
+ set<string> *out ///< [out] Subset of keys defined on oid
) = 0;
/// Get xattrs
virtual int get_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const set<string> &to_get, ///< [in] keys to get
map<string, bufferlist> *out ///< [out] subset of attrs/vals defined
) = 0;
/// Get all xattrs
virtual int get_all_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
set<string> *out ///< [out] attrs and values
) = 0;
/// set xattrs in to_set
virtual int set_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const map<string, bufferlist> &to_set,///< [in] attrs/values to set
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
/// remove xattrs in to_remove
virtual int remove_xattrs(
- const hobject_t &hoid, ///< [in] object
+ const ghobject_t &oid, ///< [in] object
const set<string> &to_remove, ///< [in] attrs to remove
const SequencerPosition *spos=0 ///< [in] sequencer position
) = 0;
- /// Clone keys efficiently from hoid map to target map
+ /// Clone keys efficiently from oid map to target map
virtual int clone(
- const hobject_t &hoid, ///< [in] object containing map
- const hobject_t &target, ///< [in] target of clone
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
const SequencerPosition *spos=0 ///< [in] sequencer position
) { return 0; }
/// Ensure all previous writes are durable
virtual int sync(
- const hobject_t *hoid=0, ///< [in] object
+ const ghobject_t *oid=0, ///< [in] object
const SequencerPosition *spos=0 ///< [in] Sequencer
) { return 0; }
@@ -144,7 +144,7 @@ public:
virtual ~ObjectMapIteratorImpl() {}
};
typedef std::tr1::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
- virtual ObjectMapIterator get_iterator(const hobject_t &hoid) {
+ virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
return ObjectMapIterator();
}
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 9d8b989225b..1a1bbcb0b67 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -15,6 +15,7 @@
#include <tr1/memory>
#include "ObjectStore.h"
#include "common/Formatter.h"
+#include "FileStore.h"
ostream& operator<<(ostream& out, const ObjectStore::Sequencer& s)
{
@@ -77,7 +78,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TOUCH:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "touch");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -87,7 +88,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_WRITE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
bufferlist bl;
@@ -104,7 +105,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_ZERO:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "zero");
@@ -118,7 +119,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TRIMCACHE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "trim_cache");
@@ -132,7 +133,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_TRUNCATE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
uint64_t off = i.get_length();
f->dump_string("op_name", "truncate");
f->dump_stream("collection") << cid;
@@ -144,7 +145,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "remove");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -154,7 +155,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_SETATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
bufferlist bl;
i.get_bl(bl);
@@ -169,7 +170,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_SETATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferptr> aset;
i.get_attrset(aset);
f->dump_string("op_name", "setattrs");
@@ -187,7 +188,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_RMATTR:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string name = i.get_attrname();
f->dump_string("op_name", "rmattr");
f->dump_stream("collection") << cid;
@@ -199,7 +200,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_RMATTRS:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "rmattrs");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -209,8 +210,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
f->dump_string("op_name", "clone");
f->dump_stream("collection") << cid;
f->dump_stream("src_oid") << oid;
@@ -221,8 +222,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONERANGE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t off = i.get_length();
uint64_t len = i.get_length();
f->dump_string("op_name", "clonerange");
@@ -237,8 +238,8 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_CLONERANGE2:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
- hobject_t noid = i.get_oid();
+ ghobject_t oid = i.get_oid();
+ ghobject_t noid = i.get_oid();
uint64_t srcoff = i.get_length();
uint64_t len = i.get_length();
uint64_t dstoff = i.get_length();
@@ -272,7 +273,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
{
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "collection_add");
f->dump_stream("src_collection") << ocid;
f->dump_stream("dst_collection") << ncid;
@@ -283,7 +284,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_COLL_REMOVE:
{
coll_t cid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "collection_remove");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -294,7 +295,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
{
coll_t ocid = i.get_cid();
coll_t ncid = i.get_cid();
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->open_object_section("collection_move");
f->dump_stream("src_collection") << ocid;
f->dump_stream("dst_collection") << ncid;
@@ -344,7 +345,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_CLEAR:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
f->dump_string("op_name", "omap_clear");
f->dump_stream("collection") << cid;
f->dump_stream("oid") << oid;
@@ -354,7 +355,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_SETKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
map<string, bufferlist> aset;
i.get_attrset(aset);
f->dump_string("op_name", "omap_setkeys");
@@ -372,7 +373,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_RMKEYS:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
set<string> keys;
i.get_keyset(keys);
f->dump_string("op_name", "omap_rmkeys");
@@ -384,7 +385,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_SETHEADER:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
bufferlist bl;
i.get_bl(bl);
f->dump_string("op_name", "omap_setheader");
@@ -425,7 +426,7 @@ void ObjectStore::Transaction::dump(ceph::Formatter *f)
case Transaction::OP_OMAP_RMKEYRANGE:
{
coll_t cid(i.get_cid());
- hobject_t oid = i.get_oid();
+ ghobject_t oid = i.get_oid();
string first, last;
first = i.get_key();
last = i.get_key();
@@ -460,9 +461,9 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
t = new Transaction;
coll_t c("foocoll");
coll_t c2("foocoll2");
- hobject_t o1("obj", "", 123, 456, -1, "");
- hobject_t o2("obj2", "", 123, 456, -1, "");
- hobject_t o3("obj3", "", 123, 456, -1, "");
+ ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
+ ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
+ ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, ""));
t->touch(c, o1);
bufferlist bl;
bl.append("some data");
@@ -497,3 +498,44 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
o.push_back(t);
}
+int ObjectStore::collection_list(coll_t c, vector<hobject_t>& o)
+{
+ vector<ghobject_t> go;
+ int ret = collection_list(c, go);
+ if (ret == 0) {
+ o.reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
+ o.push_back(i->hobj);
+ }
+ return ret;
+}
+
+int ObjectStore::collection_list_partial(coll_t c, hobject_t start,
+ int min, int max, snapid_t snap,
+ vector<hobject_t> *ls, hobject_t *next)
+{
+ vector<ghobject_t> go;
+ ghobject_t gnext, gstart(start);
+ int ret = collection_list_partial(c, gstart, min, max, snap, &go, &gnext);
+ if (ret == 0) {
+ *next = gnext.hobj;
+ ls->reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
+ ls->push_back(i->hobj);
+ }
+ return ret;
+}
+
+int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ snapid_t seq, vector<hobject_t> *ls)
+{
+ vector<ghobject_t> go;
+ ghobject_t gstart(start), gend(end);
+ int ret = collection_list_range(c, gstart, gend, seq, &go);
+ if (ret == 0) {
+ ls->reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
+ ls->push_back(i->hobj);
+ }
+ return ret;
+}
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 7e8f6ce43bf..07473b344f5 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -340,21 +340,23 @@ public:
void get_bl(bufferlist& bl) {
::decode(bl, p);
}
- hobject_t get_oid() {
- hobject_t hoid;
+ ghobject_t get_oid() {
+ ghobject_t oid;
if (sobject_encoding) {
sobject_t soid;
::decode(soid, p);
- hoid.snap = soid.snap;
- hoid.oid = soid.oid;
+ oid.hobj.snap = soid.snap;
+ oid.hobj.oid = soid.oid;
+ oid.generation = ghobject_t::NO_GEN;
+ oid.shard_id = ghobject_t::NO_SHARD;
} else {
- ::decode(hoid, p);
+ ::decode(oid, p);
if (use_pool_override && pool_override != -1 &&
- hoid.pool == -1) {
- hoid.pool = pool_override;
+ oid.hobj.pool == -1) {
+ oid.hobj.pool = pool_override;
}
}
- return hoid;
+ return oid;
}
coll_t get_cid() {
coll_t c;
@@ -408,14 +410,14 @@ public:
::encode(op, tbl);
ops++;
}
- void touch(coll_t cid, const hobject_t& oid) {
+ void touch(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_TOUCH;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void write(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
+ void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
__u32 op = OP_WRITE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -431,7 +433,7 @@ public:
::encode(data, tbl);
ops++;
}
- void zero(coll_t cid, const hobject_t& oid, uint64_t off, uint64_t len) {
+ void zero(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
__u32 op = OP_ZERO;
::encode(op, tbl);
::encode(cid, tbl);
@@ -440,7 +442,7 @@ public:
::encode(len, tbl);
ops++;
}
- void truncate(coll_t cid, const hobject_t& oid, uint64_t off) {
+ void truncate(coll_t cid, const ghobject_t& oid, uint64_t off) {
__u32 op = OP_TRUNCATE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -448,18 +450,18 @@ public:
::encode(off, tbl);
ops++;
}
- void remove(coll_t cid, const hobject_t& oid) {
+ void remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_REMOVE;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void setattr(coll_t cid, const hobject_t& oid, const char* name, bufferlist& val) {
+ void setattr(coll_t cid, const ghobject_t& oid, const char* name, bufferlist& val) {
string n(name);
setattr(cid, oid, n, val);
}
- void setattr(coll_t cid, const hobject_t& oid, const string& s, bufferlist& val) {
+ void setattr(coll_t cid, const ghobject_t& oid, const string& s, bufferlist& val) {
__u32 op = OP_SETATTR;
::encode(op, tbl);
::encode(cid, tbl);
@@ -468,7 +470,7 @@ public:
::encode(val, tbl);
ops++;
}
- void setattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& attrset) {
+ void setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& attrset) {
__u32 op = OP_SETATTRS;
::encode(op, tbl);
::encode(cid, tbl);
@@ -484,11 +486,11 @@ public:
::encode(attrset, tbl);
ops++;
}
- void rmattr(coll_t cid, const hobject_t& oid, const char *name) {
+ void rmattr(coll_t cid, const ghobject_t& oid, const char *name) {
string n(name);
rmattr(cid, oid, n);
}
- void rmattr(coll_t cid, const hobject_t& oid, const string& s) {
+ void rmattr(coll_t cid, const ghobject_t& oid, const string& s) {
__u32 op = OP_RMATTR;
::encode(op, tbl);
::encode(cid, tbl);
@@ -496,14 +498,14 @@ public:
::encode(s, tbl);
ops++;
}
- void rmattrs(coll_t cid, const hobject_t& oid) {
+ void rmattrs(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_RMATTR;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void clone(coll_t cid, const hobject_t& oid, hobject_t noid) {
+ void clone(coll_t cid, const ghobject_t& oid, ghobject_t noid) {
__u32 op = OP_CLONE;
::encode(op, tbl);
::encode(cid, tbl);
@@ -511,7 +513,7 @@ public:
::encode(noid, tbl);
ops++;
}
- void clone_range(coll_t cid, const hobject_t& oid, hobject_t noid,
+ void clone_range(coll_t cid, const ghobject_t& oid, ghobject_t noid,
uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
__u32 op = OP_CLONERANGE2;
::encode(op, tbl);
@@ -535,7 +537,7 @@ public:
::encode(cid, tbl);
ops++;
}
- void collection_add(coll_t cid, coll_t ocid, const hobject_t& oid) {
+ void collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid) {
__u32 op = OP_COLL_ADD;
::encode(op, tbl);
::encode(cid, tbl);
@@ -543,20 +545,20 @@ public:
::encode(oid, tbl);
ops++;
}
- void collection_remove(coll_t cid, const hobject_t& oid) {
+ void collection_remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_COLL_REMOVE;
::encode(op, tbl);
::encode(cid, tbl);
::encode(oid, tbl);
ops++;
}
- void collection_move(coll_t cid, coll_t oldcid, const hobject_t& oid) {
+ void collection_move(coll_t cid, coll_t oldcid, const ghobject_t& oid) {
collection_add(cid, oldcid, oid);
collection_remove(oldcid, oid);
return;
}
- void collection_move_rename(coll_t oldcid, const hobject_t& oldoid,
- coll_t cid, const hobject_t& oid) {
+ void collection_move_rename(coll_t oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& oid) {
__u32 op = OP_COLL_MOVE_RENAME;
::encode(op, tbl);
::encode(oldcid, tbl);
@@ -611,55 +613,55 @@ public:
ops++;
}
- /// Remove omap from hoid
+ /// Remove omap from oid
void omap_clear(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid ///< [in] Object from which to remove omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid ///< [in] Object from which to remove omap
) {
__u32 op = OP_OMAP_CLEAR;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
ops++;
}
- /// Set keys on hoid omap. Replaces duplicate keys.
+ /// Set keys on oid omap. Replaces duplicate keys.
void omap_setkeys(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object to update
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
const map<string, bufferlist> &attrset ///< [in] Replacement keys and values
) {
__u32 op = OP_OMAP_SETKEYS;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(attrset, tbl);
ops++;
}
- /// Remove keys from hoid omap
+ /// Remove keys from oid omap
void omap_rmkeys(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const set<string> &keys ///< [in] Keys to clear
) {
__u32 op = OP_OMAP_RMKEYS;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(keys, tbl);
ops++;
}
- /// Remove key range from hoid omap
+ /// Remove key range from oid omap
void omap_rmkeyrange(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const string& first, ///< [in] first key in range
const string& last ///< [in] first key past range
) {
__u32 op = OP_OMAP_RMKEYRANGE;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(first, tbl);
::encode(last, tbl);
ops++;
@@ -667,14 +669,14 @@ public:
/// Set omap header
void omap_setheader(
- coll_t cid, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object from which to remove the omap
+ coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
const bufferlist &bl ///< [in] Header value
) {
__u32 op = OP_OMAP_SETHEADER;
::encode(op, tbl);
::encode(cid, tbl);
- ::encode(hoid, tbl);
+ ::encode(oid, tbl);
::encode(bl, tbl);
ops++;
}
@@ -857,6 +859,8 @@ public:
virtual int get_max_object_name_length() = 0;
virtual int mkfs() = 0; // wipe
virtual int mkjournal() = 0; // journal only
+ virtual void set_allow_sharded_objects() = 0;
+ virtual bool get_allow_sharded_objects() = 0;
virtual int statfs(struct statfs *buf) = 0;
@@ -875,32 +879,32 @@ public:
virtual int get_ideal_list_max() { return 64; }
// objects
- virtual bool exists(coll_t cid, const hobject_t& oid) = 0; // useful?
+ virtual bool exists(coll_t cid, const ghobject_t& oid) = 0; // useful?
virtual int stat(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
struct stat *st,
bool allow_eio = false) = 0; // struct stat?
virtual int read(
coll_t cid,
- const hobject_t& oid,
+ const ghobject_t& oid,
uint64_t offset,
size_t len,
bufferlist& bl,
bool allow_eio = false) = 0;
- virtual int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
+ virtual int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
- virtual int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferptr& value) = 0;
- int getattr(coll_t cid, const hobject_t& oid, const char *name, bufferlist& value) {
+ virtual int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value) = 0;
+ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferlist& value) {
bufferptr bp;
int r = getattr(cid, oid, name, bp);
if (bp.length())
value.push_back(bp);
return r;
}
- virtual int getattrs(coll_t cid, const hobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) {return 0;};
+ virtual int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) {return 0;};
// collections
@@ -915,7 +919,7 @@ public:
virtual int collection_getattr(coll_t cid, const char *name, bufferlist& bl) = 0;
virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset) = 0;
virtual bool collection_empty(coll_t c) = 0;
- virtual int collection_list(coll_t c, vector<hobject_t>& o) = 0;
+ virtual int collection_list(coll_t c, vector<ghobject_t>& o) = 0;
/**
* list partial contents of collection relative to a hash offset/position
@@ -929,9 +933,9 @@ public:
* @param next [out] next item sorts >= this value
* @return zero on success, or negative error
*/
- virtual int collection_list_partial(coll_t c, hobject_t start,
+ virtual int collection_list_partial(coll_t c, ghobject_t start,
int min, int max, snapid_t snap,
- vector<hobject_t> *ls, hobject_t *next) = 0;
+ vector<ghobject_t> *ls, ghobject_t *next) = 0;
/**
* list contents of a collection that fall in the range [start, end)
@@ -943,47 +947,57 @@ public:
* @param ls [out] result
* @return zero on success, or negative error
*/
- virtual int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls) = 0;
+ virtual int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
+ snapid_t seq, vector<ghobject_t> *ls) = 0;
+
+ //TODO: Remove
+ int collection_list(coll_t c, vector<hobject_t>& o);
+
+ int collection_list_partial(coll_t c, hobject_t start,
+ int min, int max, snapid_t snap,
+ vector<hobject_t> *ls, hobject_t *next);
+
+ int collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ snapid_t seq, vector<hobject_t> *ls);
/// OMAP
/// Get omap contents
virtual int omap_get(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
bufferlist *header, ///< [out] omap header
map<string, bufferlist> *out /// < [out] Key to value map
) = 0;
/// Get omap header
virtual int omap_get_header(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
bufferlist *header, ///< [out] omap header
bool allow_eio = false ///< [in] don't assert on eio
) = 0;
- /// Get keys defined on hoid
+ /// Get keys defined on oid
virtual int omap_get_keys(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
- set<string> *keys ///< [out] Keys defined on hoid
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
) = 0;
/// Get key values
virtual int omap_get_values(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
const set<string> &keys, ///< [in] Keys to get
map<string, bufferlist> *out ///< [out] Returned keys and values
) = 0;
- /// Filters keys into out which are defined on hoid
+ /// Filters keys into out which are defined on oid
virtual int omap_check_keys(
- coll_t c, ///< [in] Collection containing hoid
- const hobject_t &hoid, ///< [in] Object containing omap
+ coll_t c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
const set<string> &keys, ///< [in] Keys to check
- set<string> *out ///< [out] Subset of keys defined on hoid
+ set<string> *out ///< [out] Subset of keys defined on oid
) = 0;
/**
@@ -997,7 +1011,7 @@ public:
*/
virtual ObjectMap::ObjectMapIterator get_omap_iterator(
coll_t c, ///< [in] collection
- const hobject_t &hoid ///< [in] object
+ const ghobject_t &oid ///< [in] object
) = 0;
virtual void sync(Context *onsync) {}
@@ -1013,8 +1027,8 @@ public:
virtual uuid_d get_fsid() = 0;
// DEBUG
- virtual void inject_data_error(const hobject_t &oid) {}
- virtual void inject_mdata_error(const hobject_t &oid) {}
+ virtual void inject_data_error(const ghobject_t &oid) {}
+ virtual void inject_mdata_error(const ghobject_t &oid) {}
};
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 8479b3c878d..e02c17677bb 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -116,7 +116,7 @@ void WBThrottle::handle_conf_change(const md_config_t *conf,
}
bool WBThrottle::get_next_should_flush(
- boost::tuple<hobject_t, FDRef, PendingWB> *next)
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next)
{
assert(lock.is_locked());
assert(next);
@@ -128,9 +128,9 @@ bool WBThrottle::get_next_should_flush(
if (stopping)
return false;
assert(!pending_wbs.empty());
- hobject_t obj(pop_object());
+ ghobject_t obj(pop_object());
- map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.find(obj);
*next = boost::make_tuple(obj, i->second.second, i->second.first);
pending_wbs.erase(i);
@@ -141,7 +141,7 @@ bool WBThrottle::get_next_should_flush(
void *WBThrottle::entry()
{
Mutex::Locker l(lock);
- boost::tuple<hobject_t, FDRef, PendingWB> wb;
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
while (get_next_should_flush(&wb)) {
clearing = wb.get<0>();
lock.Unlock();
@@ -149,24 +149,24 @@ void *WBThrottle::entry()
if (wb.get<2>().nocache)
posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
lock.Lock();
- clearing = hobject_t();
+ clearing = ghobject_t();
cur_ios -= wb.get<2>().ios;
logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
cur_size -= wb.get<2>().size;
logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
logger->dec(l_wbthrottle_inodes_dirtied);
cond.Signal();
- wb = boost::tuple<hobject_t, FDRef, PendingWB>();
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
}
return 0;
}
void WBThrottle::queue_wb(
- FDRef fd, const hobject_t &hoid, uint64_t offset, uint64_t len,
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
bool nocache)
{
Mutex::Locker l(lock);
- map<hobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
pending_wbs.find(hoid);
if (wbiter == pending_wbs.end()) {
wbiter = pending_wbs.insert(
@@ -192,7 +192,7 @@ void WBThrottle::queue_wb(
void WBThrottle::clear()
{
Mutex::Locker l(lock);
- for (map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ for (map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.begin();
i != pending_wbs.end();
++i) {
@@ -208,12 +208,12 @@ void WBThrottle::clear()
cond.Signal();
}
-void WBThrottle::clear_object(const hobject_t &hoid)
+void WBThrottle::clear_object(const ghobject_t &hoid)
{
Mutex::Locker l(lock);
while (clearing == hoid)
cond.Wait(lock);
- map<hobject_t, pair<PendingWB, FDRef> >::iterator i =
+ map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
pending_wbs.find(hoid);
if (i == pending_wbs.end())
return;
diff --git a/src/os/WBThrottle.h b/src/os/WBThrottle.h
index d480a6b751c..e418cf98d2a 100644
--- a/src/os/WBThrottle.h
+++ b/src/os/WBThrottle.h
@@ -44,7 +44,7 @@ enum {
* Tracks, throttles, and flushes outstanding IO
*/
class WBThrottle : Thread, public md_config_obs_t {
- hobject_t clearing;
+ ghobject_t clearing;
/* *_limits.first is the start_flusher limit and
* *_limits.second is the hard limit
@@ -89,36 +89,36 @@ class WBThrottle : Thread, public md_config_obs_t {
/**
* Flush objects in lru order
*/
- list<hobject_t> lru;
- map<hobject_t, list<hobject_t>::iterator> rev_lru;
- void remove_object(const hobject_t &hoid) {
+ list<ghobject_t> lru;
+ map<ghobject_t, list<ghobject_t>::iterator> rev_lru;
+ void remove_object(const ghobject_t &oid) {
assert(lock.is_locked());
- map<hobject_t, list<hobject_t>::iterator>::iterator iter =
- rev_lru.find(hoid);
+ map<ghobject_t, list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
if (iter == rev_lru.end())
return;
lru.erase(iter->second);
rev_lru.erase(iter);
}
- hobject_t pop_object() {
+ ghobject_t pop_object() {
assert(!lru.empty());
- hobject_t hoid(lru.front());
+ ghobject_t oid(lru.front());
lru.pop_front();
- rev_lru.erase(hoid);
- return hoid;
+ rev_lru.erase(oid);
+ return oid;
}
- void insert_object(const hobject_t &hoid) {
- assert(rev_lru.find(hoid) == rev_lru.end());
- lru.push_back(hoid);
- rev_lru.insert(make_pair(hoid, --lru.end()));
+ void insert_object(const ghobject_t &oid) {
+ assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
}
- map<hobject_t, pair<PendingWB, FDRef> > pending_wbs;
+ map<ghobject_t, pair<PendingWB, FDRef> > pending_wbs;
/// get next flush to perform
bool get_next_should_flush(
- boost::tuple<hobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
); ///< @return false if we are shutting down
public:
enum FS {
@@ -141,10 +141,10 @@ public:
set_from_conf();
}
- /// Queue wb on hoid, fd taking throttle (does not block)
+ /// Queue wb on oid, fd taking throttle (does not block)
void queue_wb(
- FDRef fd, ///< [in] FDRef to hoid
- const hobject_t &hoid, ///< [in] object
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
uint64_t offset, ///< [in] offset written
uint64_t len, ///< [in] length written
bool nocache ///< [in] try to clear out of cache after write
@@ -154,7 +154,7 @@ public:
void clear();
/// Clear object
- void clear_object(const hobject_t &hoid);
+ void clear_object(const ghobject_t &oid);
/// Block until there is throttle available
void throttle();
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
index aa22144bb6e..f2be1ed06e7 100644
--- a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.cc
@@ -35,7 +35,8 @@ static ostream& _prefix(std::ostream* _dout)
return *_dout << "ErasureCodeJerasure: ";
}
-void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters)
+{
dout(10) << "technique=" << technique << dendl;
parse(parameters);
prepare();
@@ -43,7 +44,8 @@ void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters) {
int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
- set<int> *minimum) {
+ set<int> *minimum)
+{
if (includes(available_chunks.begin(), available_chunks.end(),
want_to_read.begin(), want_to_read.end())) {
*minimum = want_to_read;
@@ -60,7 +62,8 @@ int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
int ErasureCodeJerasure::minimum_to_decode_with_cost(const set<int> &want_to_read,
const map<int, int> &available,
- set<int> *minimum) {
+ set<int> *minimum)
+{
set <int> available_chunks;
for (map<int, int>::const_iterator i = available.begin();
i != available.end();
@@ -71,39 +74,38 @@ int ErasureCodeJerasure::minimum_to_decode_with_cost(const set<int> &want_to_rea
int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
const bufferlist &in,
- map<int, bufferlist> *encoded) {
- unsigned in_length = pad_in_length(in.length());
- dout(10) << "encode adjusted buffer length from " << in.length() << " to " << in_length << dendl;
- assert(in_length % k == 0);
- unsigned blocksize = in_length / k;
+ map<int, bufferlist> *encoded)
+{
+ unsigned alignment = get_alignment();
+ unsigned tail = in.length() % alignment;
+ unsigned padded_length = in.length() + ( tail ? ( alignment - tail ) : 0 );
+ dout(10) << "encode adjusted buffer length from " << in.length()
+ << " to " << padded_length << dendl;
+ assert(padded_length % k == 0);
+ unsigned blocksize = padded_length / k;
unsigned length = blocksize * ( k + m );
bufferlist out(in);
bufferptr pad(length - in.length());
- pad.zero(0, k);
+ pad.zero(0, padded_length - in.length());
out.push_back(pad);
- char *p = out.c_str();
- char *data[k];
- for (int i = 0; i < k; i++) {
- data[i] = p + i * blocksize;
+ char *chunks[k + m];
+ for (int i = 0; i < k + m; i++) {
+ bufferlist &chunk = (*encoded)[i];
+ chunk.substr_of(out, i * blocksize, blocksize);
+ chunks[i] = chunk.c_str();
}
- char *coding[m];
- for (int i = 0; i < m; i++) {
- coding[i] = p + ( k + i ) * blocksize;
- }
- jerasure_encode(data, coding, blocksize);
- const bufferptr ptr = out.buffers().front();
- for (set<int>::iterator j = want_to_encode.begin();
- j != want_to_encode.end();
- j++) {
- bufferptr chunk(ptr, (*j) * blocksize, blocksize);
- (*encoded)[*j].push_front(chunk);
+ jerasure_encode(&chunks[0], &chunks[k], blocksize);
+ for (int i = 0; i < k + m; i++) {
+ if (want_to_encode.count(i) == 0)
+ encoded->erase(i);
}
return 0;
}
int ErasureCodeJerasure::decode(const set<int> &want_to_read,
const map<int, bufferlist> &chunks,
- map<int, bufferlist> *decoded) {
+ map<int, bufferlist> *decoded)
+{
unsigned blocksize = (*chunks.begin()).second.length();
int erasures[k + m + 1];
int erasures_count = 0;
@@ -133,7 +135,8 @@ int ErasureCodeJerasure::decode(const set<int> &want_to_read,
int ErasureCodeJerasure::to_int(const std::string &name,
const map<std::string,std::string> &parameters,
- int default_value) {
+ int default_value)
+{
if (parameters.find(name) == parameters.end() ||
parameters.find(name)->second.size() == 0) {
dout(10) << name << " defaults to " << default_value << dendl;
@@ -153,7 +156,8 @@ int ErasureCodeJerasure::to_int(const std::string &name,
return r;
}
-bool ErasureCodeJerasure::is_prime(int value) {
+bool ErasureCodeJerasure::is_prime(int value)
+{
int prime55[] = {
2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,
73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,
@@ -172,34 +176,39 @@ bool ErasureCodeJerasure::is_prime(int value) {
//
void ErasureCodeJerasureReedSolomonVandermonde::jerasure_encode(char **data,
char **coding,
- int blocksize) {
+ int blocksize)
+{
jerasure_matrix_encode(k, m, w, matrix, data, coding, blocksize);
}
int ErasureCodeJerasureReedSolomonVandermonde::jerasure_decode(int *erasures,
char **data,
char **coding,
- int blocksize) {
- return jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data, coding, blocksize);
+ int blocksize)
+{
+ return jerasure_matrix_decode(k, m, w, matrix, 1,
+ erasures, data, coding, blocksize);
}
-unsigned ErasureCodeJerasureReedSolomonVandermonde::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment()
+{
+ return k*w*sizeof(int);
}
-void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = to_int("erasure-code-m", parameters, DEFAULT_M);
w = to_int("erasure-code-w", parameters, DEFAULT_W);
if (w != 8 && w != 16 && w != 32) {
- derr << "ReedSolomonVandermonde: w=" << w << " must be one of {8, 16, 32} : revert to 8 " << dendl;
+ derr << "ReedSolomonVandermonde: w=" << w
+ << " must be one of {8, 16, 32} : revert to 8 " << dendl;
w = 8;
}
}
-void ErasureCodeJerasureReedSolomonVandermonde::prepare() {
+void ErasureCodeJerasureReedSolomonVandermonde::prepare()
+{
matrix = reed_sol_vandermonde_coding_matrix(k, m, w);
}
@@ -208,34 +217,38 @@ void ErasureCodeJerasureReedSolomonVandermonde::prepare() {
//
void ErasureCodeJerasureReedSolomonRAID6::jerasure_encode(char **data,
char **coding,
- int blocksize) {
+ int blocksize)
+{
reed_sol_r6_encode(k, w, data, coding, blocksize);
}
int ErasureCodeJerasureReedSolomonRAID6::jerasure_decode(int *erasures,
- char **data,
- char **coding,
- int blocksize) {
+ char **data,
+ char **coding,
+ int blocksize)
+{
return jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data, coding, blocksize);
}
-unsigned ErasureCodeJerasureReedSolomonRAID6::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment()
+{
+ return k*w*sizeof(int);
}
-void ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = 2;
w = to_int("erasure-code-w", parameters, DEFAULT_W);
if (w != 8 && w != 16 && w != 32) {
- derr << "ReedSolomonRAID6: w=" << w << " must be one of {8, 16, 32} : revert to 8 " << dendl;
+ derr << "ReedSolomonRAID6: w=" << w
+ << " must be one of {8, 16, 32} : revert to 8 " << dendl;
w = 8;
}
}
-void ErasureCodeJerasureReedSolomonRAID6::prepare() {
+void ErasureCodeJerasureReedSolomonRAID6::prepare()
+{
matrix = reed_sol_r6_coding_matrix(k, w);
}
@@ -243,32 +256,37 @@ void ErasureCodeJerasureReedSolomonRAID6::prepare() {
// ErasureCodeJerasureCauchy
//
void ErasureCodeJerasureCauchy::jerasure_encode(char **data,
- char **coding,
- int blocksize) {
- jerasure_schedule_encode(k, m, w, schedule, data, coding, blocksize, packetsize);
+ char **coding,
+ int blocksize)
+{
+ jerasure_schedule_encode(k, m, w, schedule,
+ data, coding, blocksize, packetsize);
}
int ErasureCodeJerasureCauchy::jerasure_decode(int *erasures,
- char **data,
- char **coding,
- int blocksize) {
- return jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures, data, coding, blocksize, packetsize, 1);
+ char **data,
+ char **coding,
+ int blocksize)
+{
+ return jerasure_schedule_decode_lazy(k, m, w, bitmatrix,
+ erasures, data, coding, blocksize, packetsize, 1);
}
-unsigned ErasureCodeJerasureCauchy::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*packetsize*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureCauchy::get_alignment()
+{
+ return k*w*packetsize*sizeof(int);
}
-void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = to_int("erasure-code-m", parameters, DEFAULT_M);
w = to_int("erasure-code-w", parameters, DEFAULT_W);
packetsize = to_int("erasure-code-packetsize", parameters, DEFAULT_PACKETSIZE);
}
-void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix) {
+void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix)
+{
bitmatrix = jerasure_matrix_to_bitmatrix(k, m, w, matrix);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
@@ -276,7 +294,8 @@ void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix) {
//
// ErasureCodeJerasureCauchyOrig
//
-void ErasureCodeJerasureCauchyOrig::prepare() {
+void ErasureCodeJerasureCauchyOrig::prepare()
+{
int *matrix = cauchy_original_coding_matrix(k, m, w);
prepare_schedule(matrix);
free(matrix);
@@ -285,7 +304,8 @@ void ErasureCodeJerasureCauchyOrig::prepare() {
//
// ErasureCodeJerasureCauchyGood
//
-void ErasureCodeJerasureCauchyGood::prepare() {
+void ErasureCodeJerasureCauchyGood::prepare()
+{
int *matrix = cauchy_good_general_coding_matrix(k, m, w);
prepare_schedule(matrix);
free(matrix);
@@ -294,7 +314,8 @@ void ErasureCodeJerasureCauchyGood::prepare() {
//
// ErasureCodeJerasureLiberation
//
-ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation() {
+ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation()
+{
if (bitmatrix)
free(bitmatrix);
if (schedule)
@@ -303,24 +324,28 @@ ErasureCodeJerasureLiberation::~ErasureCodeJerasureLiberation() {
void ErasureCodeJerasureLiberation::jerasure_encode(char **data,
char **coding,
- int blocksize) {
- jerasure_schedule_encode(k, m, w, schedule, data, coding, blocksize, packetsize);
+ int blocksize)
+{
+ jerasure_schedule_encode(k, m, w, schedule, data,
+ coding, blocksize, packetsize);
}
int ErasureCodeJerasureLiberation::jerasure_decode(int *erasures,
char **data,
char **coding,
- int blocksize) {
- return jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures, data, coding, blocksize, packetsize, 1);
+ int blocksize)
+{
+ return jerasure_schedule_decode_lazy(k, m, w, bitmatrix, erasures, data,
+ coding, blocksize, packetsize, 1);
}
-unsigned ErasureCodeJerasureLiberation::pad_in_length(unsigned in_length) {
- while (in_length%(k*w*packetsize*sizeof(int)) != 0)
- in_length++;
- return in_length;
+unsigned ErasureCodeJerasureLiberation::get_alignment()
+{
+ return k*w*packetsize*sizeof(int);
}
-void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = to_int("erasure-code-m", parameters, DEFAULT_M);
w = to_int("erasure-code-w", parameters, DEFAULT_W);
@@ -340,18 +365,21 @@ void ErasureCodeJerasureLiberation::parse(const map<std::string,std::string> &pa
error = true;
}
if ((packetsize%(sizeof(int))) != 0) {
- derr << "packetsize=" << packetsize << " must be a multiple of sizeof(int) = " << sizeof(int) << dendl;
+ derr << "packetsize=" << packetsize
+ << " must be a multiple of sizeof(int) = " << sizeof(int) << dendl;
error = true;
}
if (error) {
- derr << "reverting to k=" << DEFAULT_K << ", w=" << DEFAULT_W << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
+ derr << "reverting to k=" << DEFAULT_K << ", w="
+ << DEFAULT_W << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
k = DEFAULT_K;
w = DEFAULT_W;
packetsize = DEFAULT_PACKETSIZE;
}
}
-void ErasureCodeJerasureLiberation::prepare() {
+void ErasureCodeJerasureLiberation::prepare()
+{
bitmatrix = liberation_coding_bitmatrix(k, w);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
@@ -359,7 +387,8 @@ void ErasureCodeJerasureLiberation::prepare() {
//
// ErasureCodeJerasureBlaumRoth
//
-void ErasureCodeJerasureBlaumRoth::prepare() {
+void ErasureCodeJerasureBlaumRoth::prepare()
+{
bitmatrix = blaum_roth_coding_bitmatrix(k, w);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
@@ -367,7 +396,8 @@ void ErasureCodeJerasureBlaumRoth::prepare() {
//
// ErasureCodeJerasureLiber8tion
//
-void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &parameters) {
+void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &parameters)
+{
k = to_int("erasure-code-k", parameters, DEFAULT_K);
m = DEFAULT_M;
w = DEFAULT_W;
@@ -383,13 +413,15 @@ void ErasureCodeJerasureLiber8tion::parse(const map<std::string,std::string> &pa
error = true;
}
if (error) {
- derr << "reverting to k=" << DEFAULT_K << ", packetsize=" << DEFAULT_PACKETSIZE << dendl;
+ derr << "reverting to k=" << DEFAULT_K << ", packetsize="
+ << DEFAULT_PACKETSIZE << dendl;
k = DEFAULT_K;
packetsize = DEFAULT_PACKETSIZE;
}
}
-void ErasureCodeJerasureLiber8tion::prepare() {
+void ErasureCodeJerasureLiber8tion::prepare()
+{
bitmatrix = liber8tion_coding_bitmatrix(k);
schedule = jerasure_smart_bitmatrix_to_schedule(k, m, w, bitmatrix);
}
diff --git a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
index 7728751c383..fc76ed7b1e2 100644
--- a/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
+++ b/src/osd/ErasureCodePluginJerasure/ErasureCodeJerasure.h
@@ -56,7 +56,7 @@ public:
char **data,
char **coding,
int blocksize) = 0;
- virtual unsigned pad_in_length(unsigned in_length) = 0;
+ virtual unsigned get_alignment() = 0;
virtual void parse(const map<std::string,std::string> &parameters) = 0;
virtual void prepare() = 0;
static int to_int(const std::string &name,
@@ -88,7 +88,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
@@ -115,7 +115,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
@@ -149,7 +149,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
void prepare_schedule(int *matrix);
};
@@ -196,7 +196,7 @@ public:
char **data,
char **coding,
int blocksize);
- virtual unsigned pad_in_length(unsigned in_length);
+ virtual unsigned get_alignment();
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am
index ea7c036f858..cae02015fce 100644
--- a/src/osd/Makefile.am
+++ b/src/osd/Makefile.am
@@ -9,12 +9,14 @@ libosd_la_SOURCES = \
osd/PG.cc \
osd/PGLog.cc \
osd/ReplicatedPG.cc \
+ osd/ReplicatedBackend.cc \
osd/Ager.cc \
osd/OSD.cc \
osd/OSDCap.cc \
osd/Watch.cc \
osd/ClassHandler.cc \
osd/OpRequest.cc \
+ common/TrackedOp.cc \
osd/SnapMapper.cc \
osd/osd_types.cc \
objclass/class_api.cc
@@ -35,6 +37,8 @@ noinst_HEADERS += \
osd/PG.h \
osd/PGLog.h \
osd/ReplicatedPG.h \
+ osd/PGBackend.h \
+ osd/ReplicatedBackend.h \
osd/Watch.h \
osd/osd_types.h
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 1cdc232b064..b2aa2ebbcd2 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -134,7 +134,9 @@ static ostream& _prefix(std::ostream* _dout, int whoami, OSDMapRef osdmap) {
<< " ";
}
-static CompatSet get_osd_compat_set() {
+//Initial features in new superblock.
+//Features here are also automatically upgraded
+CompatSet OSD::get_osd_initial_compat_set() {
CompatSet::FeatureSet ceph_osd_feature_compat;
CompatSet::FeatureSet ceph_osd_feature_ro_compat;
CompatSet::FeatureSet ceph_osd_feature_incompat;
@@ -152,6 +154,14 @@ static CompatSet get_osd_compat_set() {
ceph_osd_feature_incompat);
}
+//Features are added here that this OSD supports.
+CompatSet OSD::get_osd_compat_set() {
+ CompatSet compat = get_osd_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
OSDService::OSDService(OSD *osd) :
osd(osd),
cct(osd->cct),
@@ -170,6 +180,7 @@ OSDService::OSDService(OSD *osd) :
scrub_wq(osd->scrub_wq),
scrub_finalize_wq(osd->scrub_finalize_wq),
rep_scrub_wq(osd->rep_scrub_wq),
+ push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
class_handler(osd->class_handler),
publish_lock("OSDService::publish_lock"),
pre_publish_lock("OSDService::pre_publish_lock"),
@@ -423,6 +434,7 @@ void OSDService::init()
objecter_timer.init();
objecter->set_client_incarnation(0);
objecter->init_locked();
+ objecter->unset_honor_cache_redirects();
}
watch_timer.init();
}
@@ -449,7 +461,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
{
coll_t tmp0("convertfs_temp");
coll_t tmp1("convertfs_temp1");
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
map<string, bufferptr> aset;
int r = store->collection_getattrs(cid, aset);
@@ -469,10 +481,10 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
store->apply_transaction(t);
}
- hobject_t next;
+ ghobject_t next;
while (!next.is_max()) {
objects.clear();
- hobject_t start = next;
+ ghobject_t start = next;
r = store->collection_list_partial(cid, start,
200, 300, 0,
&objects, &next);
@@ -480,7 +492,7 @@ int OSD::convert_collection(ObjectStore *store, coll_t cid)
return r;
ObjectStore::Transaction t;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
t.collection_add(tmp0, cid, *i);
@@ -646,7 +658,7 @@ int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev,
sb.cluster_fsid = fsid;
sb.osd_fsid = store->get_fsid();
sb.whoami = whoami;
- sb.compat_features = get_osd_compat_set();
+ sb.compat_features = get_osd_initial_compat_set();
// benchmark?
if (cct->_conf->osd_auto_weight) {
@@ -701,7 +713,7 @@ int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev,
goto umount_store;
}
- ret = write_meta(dev, "ready", "ready\n", 6);
+ ret = safe_write_file(dev.c_str(), "ready", "ready\n", 6);
if (ret) {
derr << "OSD::mkfs: failed to write ready file: error " << ret << dendl;
goto umount_store;
@@ -757,103 +769,19 @@ int OSD::dump_journal(CephContext *cct, const std::string &dev, const std::strin
return err;
}
-int OSD::write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen)
-{
- int ret;
- char fn[PATH_MAX];
- char tmp[PATH_MAX];
- int fd;
-
- // does the file already have correct content?
- char oldval[80];
- ret = read_meta(base, file, oldval, sizeof(oldval));
- if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
- return 0; // yes.
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base.c_str(), file.c_str());
- fd = ::open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: error opening '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ret = safe_write(fd, val, vallen);
- if (ret) {
- derr << "write_meta: failed to write to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
- }
-
- ret = ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to fsync to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = ::rename(tmp, fn);
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to rename '" << tmp << "' to '" << fn << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- fd = ::open(base.c_str(), O_RDONLY);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: failed to open dir '" << base << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
-
- return 0;
-}
-
-int OSD::read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen)
-{
- char fn[PATH_MAX];
- int fd, len;
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- return -err;
- }
- len = safe_read(fd, val, vallen);
- if (len < 0) {
- TEMP_FAILURE_RETRY(::close(fd));
- return len;
- }
- // close sometimes returns errors, but only after write()
- TEMP_FAILURE_RETRY(::close(fd));
-
- val[len] = 0;
- return len;
-}
-
int OSD::write_meta(const std::string &base, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
char val[80];
snprintf(val, sizeof(val), "%s\n", CEPH_OSD_ONDISK_MAGIC);
- write_meta(base, "magic", val, strlen(val));
+ safe_write_file(base.c_str(), "magic", val, strlen(val));
snprintf(val, sizeof(val), "%d\n", whoami);
- write_meta(base, "whoami", val, strlen(val));
+ safe_write_file(base.c_str(), "whoami", val, strlen(val));
cluster_fsid.print(val);
strcat(val, "\n");
- write_meta(base, "ceph_fsid", val, strlen(val));
+ safe_write_file(base.c_str(), "ceph_fsid", val, strlen(val));
return 0;
}
@@ -863,24 +791,24 @@ int OSD::peek_meta(const std::string &dev, std::string& magic,
{
char val[80] = { 0 };
- if (read_meta(dev, "magic", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "magic", val, sizeof(val)) < 0)
return -errno;
int l = strlen(val);
if (l && val[l-1] == '\n')
val[l-1] = 0;
magic = val;
- if (read_meta(dev, "whoami", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "whoami", val, sizeof(val)) < 0)
return -errno;
whoami = atoi(val);
- if (read_meta(dev, "ceph_fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "ceph_fsid", val, sizeof(val)) < 0)
return -errno;
if (strlen(val) > 36)
val[36] = 0;
cluster_fsid.parse(val);
- if (read_meta(dev, "fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "fsid", val, sizeof(val)) < 0)
osd_fsid = uuid_d();
else {
if (strlen(val) > 36)
@@ -979,6 +907,10 @@ OSD::OSD(CephContext *cct_, int id, Messenger *internal_messenger, Messenger *ex
service(this)
{
monc->set_messenger(client_messenger);
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
}
OSD::~OSD()
@@ -1139,6 +1071,7 @@ public:
int OSD::init()
{
+ CompatSet initial, diff;
Mutex::Locker lock(osd_lock);
if (is_stopping())
return 0;
@@ -1163,9 +1096,48 @@ int OSD::init()
r = read_superblock();
if (r < 0) {
derr << "OSD::init() : unable to read osd superblock" << dendl;
- store->umount();
- delete store;
- return -EINVAL;
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (osd_compat.compare(superblock.compat_features) < 0) {
+ derr << "The disk uses features unsupported by the executable." << dendl;
+ derr << " ondisk features " << superblock.compat_features << dendl;
+ derr << " daemon features " << osd_compat << dendl;
+
+ if (osd_compat.writeable(superblock.compat_features)) {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "it is still writeable, though. Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ else {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "Cannot write to disk! Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ assert_warn(whoami == superblock.whoami);
+ if (whoami != superblock.whoami) {
+ derr << "OSD::init: superblock says osd"
+ << superblock.whoami << " but i am osd." << whoami << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ initial = get_osd_initial_compat_set();
+ diff = superblock.compat_features.unsupported(initial);
+ if (superblock.compat_features.merge(initial)) {
+ // We need to persist the new compat_set before we
+ // do anything else
+ dout(5) << "Upgrading superblock adding: " << diff << dendl;
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ r = store->apply_transaction(t);
+ if (r < 0)
+ goto out;
}
// make sure info object exists
@@ -1175,7 +1147,7 @@ int OSD::init()
t.touch(coll_t::META_COLL, service.infos_oid);
r = store->apply_transaction(t);
if (r < 0)
- return r;
+ goto out;
}
// make sure snap mapper object exists
@@ -1185,19 +1157,7 @@ int OSD::init()
t.touch(coll_t::META_COLL, OSD::make_snapmapper_oid());
r = store->apply_transaction(t);
if (r < 0)
- return r;
- }
-
- if (osd_compat.compare(superblock.compat_features) != 0) {
- // We need to persist the new compat_set before we
- // do anything else
- dout(5) << "Upgrading superblock compat_set" << dendl;
- superblock.compat_features = osd_compat;
- ObjectStore::Transaction t;
- write_superblock(t);
- r = store->apply_transaction(t);
- if (r < 0)
- return r;
+ goto out;
}
class_handler = new ClassHandler(cct);
@@ -1213,7 +1173,8 @@ int OSD::init()
assert_warn(!osdmap);
if (osdmap) {
derr << "OSD::init: unable to read current osdmap" << dendl;
- return -EINVAL;
+ r = -EINVAL;
+ goto out;
}
osdmap = get_map(superblock.current_epoch);
check_osdmap_features();
@@ -1226,12 +1187,6 @@ int OSD::init()
load_pgs();
dout(2) << "superblock: i am osd." << superblock.whoami << dendl;
- assert_warn(whoami == superblock.whoami);
- if (whoami != superblock.whoami) {
- derr << "OSD::init: logic error: superblock says osd"
- << superblock.whoami << " but i am osd." << whoami << dendl;
- return -EINVAL;
- }
create_logger();
@@ -1248,7 +1203,7 @@ int OSD::init()
monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD);
r = monc->init();
if (r < 0)
- return r;
+ goto out;
// tell monc about log_client so it will know about mon session resets
monc->set_log_client(&clog);
@@ -1272,12 +1227,10 @@ int OSD::init()
r = monc->authenticate();
if (r < 0) {
- monc->shutdown();
- store->umount();
osd_lock.Lock(); // locker is going to unlock this on function exit
if (is_stopping())
- return 0;
- return r;
+ r = 0;
+ goto monout;
}
while (monc->wait_auth_rotating(30.0) < 0) {
@@ -1297,6 +1250,13 @@ int OSD::init()
start_boot();
return 0;
+monout:
+ monc->shutdown();
+
+out:
+ store->umount();
+ delete store;
+ return r;
}
void OSD::final_init()
@@ -1715,28 +1675,6 @@ int OSD::read_superblock()
::decode(superblock, p);
dout(10) << "read_superblock " << superblock << dendl;
- if (osd_compat.compare(superblock.compat_features) < 0) {
- derr << "The disk uses features unsupported by the executable." << dendl;
- derr << " ondisk features " << superblock.compat_features << dendl;
- derr << " daemon features " << osd_compat << dendl;
-
- if (osd_compat.writeable(superblock.compat_features)) {
- derr << "it is still writeable, though. Missing features:" << dendl;
- CompatSet diff = osd_compat.unsupported(superblock.compat_features);
- return -EOPNOTSUPP;
- }
- else {
- derr << "Cannot write to disk! Missing features:" << dendl;
- CompatSet diff = osd_compat.unsupported(superblock.compat_features);
- return -EOPNOTSUPP;
- }
- }
-
- if (whoami != superblock.whoami) {
- derr << "read_superblock superblock says osd." << superblock.whoami
- << ", but i (think i) am osd." << whoami << dendl;
- return -1;
- }
return 0;
}
@@ -1751,17 +1689,17 @@ void OSD::recursive_remove_collection(ObjectStore *store, coll_t tmp)
make_snapmapper_oid());
SnapMapper mapper(&driver, 0, 0, 0);
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
store->collection_list(tmp, objects);
// delete them.
ObjectStore::Transaction t;
unsigned removed = 0;
- for (vector<hobject_t>::iterator p = objects.begin();
+ for (vector<ghobject_t>::iterator p = objects.begin();
p != objects.end();
++p, removed++) {
OSDriver::OSTransaction _t(driver.get_transaction(&t));
- int r = mapper.remove_oid(*p, &_t);
+ int r = mapper.remove_oid(p->hobj, &_t);
if (r != 0 && r != -ENOENT)
assert(0);
t.collection_remove(tmp, *p);
@@ -3342,10 +3280,10 @@ bool remove_dir(
ObjectStore::Sequencer *osr,
coll_t coll, DeletingStateRef dstate)
{
- vector<hobject_t> olist;
+ vector<ghobject_t> olist;
int64_t num = 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
- hobject_t next;
+ ghobject_t next;
while (!next.is_max()) {
store->collection_list_partial(
coll,
@@ -3355,11 +3293,11 @@ bool remove_dir(
0,
&olist,
&next);
- for (vector<hobject_t>::iterator i = olist.begin();
+ for (vector<ghobject_t>::iterator i = olist.begin();
i != olist.end();
++i, ++num) {
OSDriver::OSTransaction _t(osdriver->get_transaction(t));
- int r = mapper->remove_oid(*i, &_t);
+ int r = mapper->remove_oid(i->hobj, &_t);
if (r != 0 && r != -ENOENT) {
assert(0);
}
@@ -3402,16 +3340,16 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item)
if (!item.second->start_clearing())
return;
- if (pg->have_temp_coll()) {
+ list<coll_t> colls_to_remove;
+ pg->get_colls(&colls_to_remove);
+ for (list<coll_t>::iterator i = colls_to_remove.begin();
+ i != colls_to_remove.end();
+ ++i) {
bool cont = remove_dir(
- pg->cct, store, &mapper, &driver, pg->osr.get(), pg->get_temp_coll(), item.second);
+ pg->cct, store, &mapper, &driver, pg->osr.get(), *i, item.second);
if (!cont)
return;
}
- bool cont = remove_dir(
- pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second);
- if (!cont)
- return;
if (!item.second->start_deleting())
return;
@@ -3422,9 +3360,12 @@ void OSD::RemoveWQ::_process(pair<PGRef, DeletingStateRef> item)
OSD::make_infos_oid(),
pg->log_oid,
t);
- if (pg->have_temp_coll())
- t->remove_collection(pg->get_temp_coll());
- t->remove_collection(coll);
+
+ for (list<coll_t>::iterator i = colls_to_remove.begin();
+ i != colls_to_remove.end();
+ ++i) {
+ t->remove_collection(*i);
+ }
// We need the sequencer to stick around until the op is complete
store->queue_transaction(
@@ -4602,7 +4543,7 @@ void OSD::do_waiters()
void OSD::dispatch_op(OpRequestRef op)
{
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case MSG_OSD_PG_CREATE:
handle_pg_create(op);
@@ -4728,7 +4669,7 @@ void OSD::_dispatch(Message *m)
default:
{
- OpRequestRef op = op_tracker.create_request(m);
+ OpRequestRef op = op_tracker.create_request<OpRequest>(m);
op->mark_event("waiting_for_osdmap");
// no map? starting up?
if (!osdmap) {
@@ -5774,9 +5715,9 @@ bool OSD::require_mon_peer(Message *m)
bool OSD::require_osd_peer(OpRequestRef op)
{
- if (!op->request->get_connection()->peer_is_osd()) {
- dout(0) << "require_osd_peer received from non-osd " << op->request->get_connection()->get_peer_addr()
- << " " << *op->request << dendl;
+ if (!op->get_req()->get_connection()->peer_is_osd()) {
+ dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
+ << " " << *op->get_req() << dendl;
return false;
}
return true;
@@ -5788,7 +5729,7 @@ bool OSD::require_osd_peer(OpRequestRef op)
*/
bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
{
- Message *m = op->request;
+ Message *m = op->get_req();
dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
assert(osd_lock.is_locked());
@@ -5879,22 +5820,11 @@ void OSD::split_pgs(
dout(10) << "m_seed " << i->ps() << dendl;
dout(10) << "split_bits is " << split_bits << dendl;
- rctx->transaction->create_collection(
- coll_t(*i));
- rctx->transaction->split_collection(
- coll_t(parent->info.pgid),
+ parent->split_colls(
+ *i,
split_bits,
i->m_seed,
- coll_t(*i));
- if (parent->have_temp_coll()) {
- rctx->transaction->create_collection(
- coll_t::make_temp_coll(*i));
- rctx->transaction->split_collection(
- coll_t::make_temp_coll(parent->info.pgid),
- split_bits,
- i->m_seed,
- coll_t::make_temp_coll(*i));
- }
+ rctx->transaction);
parent->split_into(
*i,
child,
@@ -5911,7 +5841,7 @@ void OSD::split_pgs(
*/
void OSD::handle_pg_create(OpRequestRef op)
{
- MOSDPGCreate *m = (MOSDPGCreate*)op->request;
+ MOSDPGCreate *m = (MOSDPGCreate*)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_CREATE);
dout(10) << "handle_pg_create " << *m << dendl;
@@ -5931,11 +5861,16 @@ void OSD::handle_pg_create(OpRequestRef op)
}
}
- if (!require_mon_peer(op->request)) {
- // we have to hack around require_mon_peer's interface limits
- op->request = NULL;
+ /* we have to hack around require_mon_peer's interface limits, so
+ * grab an extra reference before going in. If the peer isn't
+ * a Monitor, the reference is put for us (and then cleared
+ * up automatically by our OpTracker infrastructure). Otherwise,
+ * we put the extra ref ourself.
+ */
+ if (!require_mon_peer(op->get_req()->get())) {
return;
}
+ op->get_req()->put();
if (!require_same_or_newer_map(op, m->epoch)) return;
@@ -6240,7 +6175,7 @@ void OSD::do_infos(map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >& info
*/
void OSD::handle_pg_notify(OpRequestRef op)
{
- MOSDPGNotify *m = (MOSDPGNotify*)op->request;
+ MOSDPGNotify *m = (MOSDPGNotify*)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_NOTIFY);
dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
@@ -6275,7 +6210,7 @@ void OSD::handle_pg_notify(OpRequestRef op)
void OSD::handle_pg_log(OpRequestRef op)
{
- MOSDPGLog *m = (MOSDPGLog*) op->request;
+ MOSDPGLog *m = (MOSDPGLog*) op->get_req();
assert(m->get_header().type == MSG_OSD_PG_LOG);
dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
@@ -6303,7 +6238,7 @@ void OSD::handle_pg_log(OpRequestRef op)
void OSD::handle_pg_info(OpRequestRef op)
{
- MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->request);
+ MOSDPGInfo *m = static_cast<MOSDPGInfo *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_INFO);
dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
@@ -6336,7 +6271,7 @@ void OSD::handle_pg_info(OpRequestRef op)
void OSD::handle_pg_trim(OpRequestRef op)
{
- MOSDPGTrim *m = (MOSDPGTrim *)op->request;
+ MOSDPGTrim *m = (MOSDPGTrim *)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_TRIM);
dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
@@ -6389,7 +6324,7 @@ void OSD::handle_pg_trim(OpRequestRef op)
void OSD::handle_pg_scan(OpRequestRef op)
{
- MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request);
+ MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_SCAN);
dout(10) << "handle_pg_scan " << *m << " from " << m->get_source() << dendl;
@@ -6417,7 +6352,7 @@ void OSD::handle_pg_scan(OpRequestRef op)
void OSD::handle_pg_backfill(OpRequestRef op)
{
- MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request);
+ MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_BACKFILL);
dout(10) << "handle_pg_backfill " << *m << " from " << m->get_source() << dendl;
@@ -6445,7 +6380,7 @@ void OSD::handle_pg_backfill(OpRequestRef op)
void OSD::handle_pg_backfill_reserve(OpRequestRef op)
{
- MBackfillReserve *m = static_cast<MBackfillReserve*>(op->request);
+ MBackfillReserve *m = static_cast<MBackfillReserve*>(op->get_req());
assert(m->get_header().type == MSG_OSD_BACKFILL_RESERVE);
if (!require_osd_peer(op))
@@ -6489,7 +6424,7 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op)
void OSD::handle_pg_recovery_reserve(OpRequestRef op)
{
- MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->request);
+ MRecoveryReserve *m = static_cast<MRecoveryReserve*>(op->get_req());
assert(m->get_header().type == MSG_OSD_RECOVERY_RESERVE);
if (!require_osd_peer(op))
@@ -6541,7 +6476,7 @@ void OSD::handle_pg_query(OpRequestRef op)
{
assert(osd_lock.is_locked());
- MOSDPGQuery *m = (MOSDPGQuery*)op->request;
+ MOSDPGQuery *m = (MOSDPGQuery*)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_QUERY);
if (!require_osd_peer(op))
@@ -6628,7 +6563,7 @@ void OSD::handle_pg_query(OpRequestRef op)
void OSD::handle_pg_remove(OpRequestRef op)
{
- MOSDPGRemove *m = (MOSDPGRemove *)op->request;
+ MOSDPGRemove *m = (MOSDPGRemove *)op->get_req();
assert(m->get_header().type == MSG_OSD_PG_REMOVE);
assert(osd_lock.is_locked());
@@ -6901,7 +6836,7 @@ void OSDService::reply_op_error(OpRequestRef op, int err)
void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
version_t uv)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
int flags;
flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
@@ -6913,7 +6848,7 @@ void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
if (m->get_map_epoch() < pg->info.history.same_primary_since) {
@@ -6932,7 +6867,7 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
void OSD::handle_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
if (op_is_discardable(m)) {
dout(10) << " discardable " << *m << dendl;
@@ -7067,7 +7002,7 @@ void OSD::handle_op(OpRequestRef op)
template<typename T, int MSGTYPE>
void OSD::handle_replica_op(OpRequestRef op)
{
- T *m = static_cast<T *>(op->request);
+ T *m = static_cast<T *>(op->get_req());
assert(m->get_header().type == MSGTYPE);
dout(10) << __func__ << *m << " epoch " << m->map_epoch << dendl;
@@ -7121,24 +7056,24 @@ bool OSD::op_is_discardable(MOSDOp *op)
*/
void OSD::enqueue_op(PG *pg, OpRequestRef op)
{
- utime_t latency = ceph_clock_now(cct) - op->request->get_recv_stamp();
- dout(15) << "enqueue_op " << op << " prio " << op->request->get_priority()
- << " cost " << op->request->get_cost()
+ utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp();
+ dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
+ << " cost " << op->get_req()->get_cost()
<< " latency " << latency
- << " " << *(op->request) << dendl;
+ << " " << *(op->get_req()) << dendl;
pg->queue_op(op);
}
void OSD::OpWQ::_enqueue(pair<PGRef, OpRequestRef> item)
{
- unsigned priority = item.second->request->get_priority();
- unsigned cost = item.second->request->get_cost();
+ unsigned priority = item.second->get_req()->get_priority();
+ unsigned cost = item.second->get_req()->get_cost();
if (priority >= CEPH_MSG_PRIO_LOW)
pqueue.enqueue_strict(
- item.second->request->get_source_inst(),
+ item.second->get_req()->get_source_inst(),
priority, item);
else
- pqueue.enqueue(item.second->request->get_source_inst(),
+ pqueue.enqueue(item.second->get_req()->get_source_inst(),
priority, cost, item);
osd->logger->set(l_osd_opq, pqueue.length());
}
@@ -7153,14 +7088,14 @@ void OSD::OpWQ::_enqueue_front(pair<PGRef, OpRequestRef> item)
pg_for_processing[&*(item.first)].pop_back();
}
}
- unsigned priority = item.second->request->get_priority();
- unsigned cost = item.second->request->get_cost();
+ unsigned priority = item.second->get_req()->get_priority();
+ unsigned cost = item.second->get_req()->get_cost();
if (priority >= CEPH_MSG_PRIO_LOW)
pqueue.enqueue_strict_front(
- item.second->request->get_source_inst(),
+ item.second->get_req()->get_source_inst(),
priority, item);
else
- pqueue.enqueue_front(item.second->request->get_source_inst(),
+ pqueue.enqueue_front(item.second->get_req()->get_source_inst(),
priority, cost, item);
osd->logger->set(l_osd_opq, pqueue.length());
}
@@ -7212,11 +7147,11 @@ void OSD::dequeue_op(
PGRef pg, OpRequestRef op,
ThreadPool::TPHandle &handle)
{
- utime_t latency = ceph_clock_now(cct) - op->request->get_recv_stamp();
- dout(10) << "dequeue_op " << op << " prio " << op->request->get_priority()
- << " cost " << op->request->get_cost()
+ utime_t latency = ceph_clock_now(cct) - op->get_req()->get_recv_stamp();
+ dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
+ << " cost " << op->get_req()->get_cost()
<< " latency " << latency
- << " " << *(op->request)
+ << " " << *(op->get_req())
<< " pg " << *pg << dendl;
if (pg->deleting)
return;
@@ -7317,6 +7252,8 @@ const char** OSD::get_tracked_conf_keys() const
{
static const char* KEYS[] = {
"osd_max_backfills",
+ "osd_op_complaint_time", "osd_op_log_threshold",
+ "osd_op_history_size", "osd_op_history_duration",
NULL
};
return KEYS;
@@ -7329,13 +7266,23 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
service.local_reserver.set_max(cct->_conf->osd_max_backfills);
service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
}
+ if (changed.count("osd_op_complaint_time") ||
+ changed.count("osd_op_log_threshold")) {
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ }
+ if (changed.count("osd_op_history_size") ||
+ changed.count("osd_op_history_duration")) {
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
+ }
}
// --------------------------------
int OSD::init_op_flags(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
vector<OSDOp>::iterator iter;
// client flags have no bearing on whether an op is a read, write, etc.
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index c2f45196870..9346cee6890 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -307,6 +307,7 @@ public:
ThreadPool::WorkQueue<PG> &scrub_wq;
ThreadPool::WorkQueue<PG> &scrub_finalize_wq;
ThreadPool::WorkQueue<MOSDRepScrub> &rep_scrub_wq;
+ GenContextWQ push_wq;
ClassHandler *&class_handler;
void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
@@ -635,6 +636,20 @@ public:
OSDService(OSD *osd);
~OSDService();
};
+
+struct C_OSD_SendMessageOnConn: public Context {
+ OSDService *osd;
+ Message *reply;
+ ConnectionRef conn;
+ C_OSD_SendMessageOnConn(
+ OSDService *osd,
+ Message *reply,
+ ConnectionRef conn) : osd(osd), reply(reply), conn(conn) {}
+ void finish(int) {
+ osd->send_message_osd_cluster(reply, conn.get());
+ }
+};
+
class OSD : public Dispatcher,
public md_config_obs_t {
/** OSD **/
@@ -731,6 +746,25 @@ public:
return oid;
}
static void recursive_remove_collection(ObjectStore *store, coll_t tmp);
+
+ /**
+ * get_osd_initial_compat_set()
+ *
+ * Get the initial feature set for this OSD. Features
+ * here are automatically upgraded.
+ *
+ * Return value: Initial osd CompatSet
+ */
+ static CompatSet get_osd_initial_compat_set();
+
+ /**
+ * get_osd_compat_set()
+ *
+ * Get all features supported by this OSD
+ *
+ * Return value: CompatSet of all supported features
+ */
+ static CompatSet get_osd_compat_set();
private:
@@ -1704,10 +1738,6 @@ protected:
}
private:
- static int write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen);
- static int read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen);
static int write_meta(const std::string &base,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
public:
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index 1ffe3073051..2ed7a23086f 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -11,229 +11,21 @@
#include "messages/MOSDSubOp.h"
#include "include/assert.h"
-#define dout_subsys ceph_subsys_optracker
-#undef dout_prefix
-#define dout_prefix _prefix(_dout)
-static ostream& _prefix(std::ostream* _dout)
-{
- return *_dout << "--OSD::tracker-- ";
-}
OpRequest::OpRequest(Message *req, OpTracker *tracker) :
- request(req), xitem(this),
+ TrackedOp(req, tracker),
rmw_flags(0),
- warn_interval_multiplier(1),
- lock("OpRequest::lock"),
- tracker(tracker),
- hit_flag_points(0), latest_flag_point(0),
- seq(0) {
- received_time = request->get_recv_stamp();
- tracker->register_inflight_op(&xitem);
+ hit_flag_points(0), latest_flag_point(0) {
if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) {
// don't warn as quickly for low priority ops
warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple;
}
}
-void OpHistory::on_shutdown()
-{
- arrived.clear();
- duration.clear();
- shutdown = true;
-}
-
-void OpHistory::insert(utime_t now, OpRequestRef op)
-{
- if (shutdown)
- return;
- duration.insert(make_pair(op->get_duration(), op));
- arrived.insert(make_pair(op->get_arrived(), op));
- cleanup(now);
-}
-
-void OpHistory::cleanup(utime_t now)
-{
- while (arrived.size() &&
- (now - arrived.begin()->first >
- (double)(tracker->cct->_conf->osd_op_history_duration))) {
- duration.erase(make_pair(
- arrived.begin()->second->get_duration(),
- arrived.begin()->second));
- arrived.erase(arrived.begin());
- }
-
- while (duration.size() > tracker->cct->_conf->osd_op_history_size) {
- arrived.erase(make_pair(
- duration.begin()->second->get_arrived(),
- duration.begin()->second));
- duration.erase(duration.begin());
- }
-}
-
-void OpHistory::dump_ops(utime_t now, Formatter *f)
-{
- cleanup(now);
- f->open_object_section("OpHistory");
- f->dump_int("num to keep", tracker->cct->_conf->osd_op_history_size);
- f->dump_int("duration to keep", tracker->cct->_conf->osd_op_history_duration);
- {
- f->open_array_section("Ops");
- for (set<pair<utime_t, OpRequestRef> >::const_iterator i =
- arrived.begin();
- i != arrived.end();
- ++i) {
- f->open_object_section("Op");
- i->second->dump(now, f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
-}
-
-void OpTracker::dump_historic_ops(Formatter *f)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- utime_t now = ceph_clock_now(cct);
- history.dump_ops(now, f);
-}
-
-void OpTracker::dump_ops_in_flight(Formatter *f)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- f->open_object_section("ops_in_flight"); // overall dump
- f->dump_int("num_ops", ops_in_flight.size());
- f->open_array_section("ops"); // list of OpRequests
- utime_t now = ceph_clock_now(cct);
- for (xlist<OpRequest*>::iterator p = ops_in_flight.begin(); !p.end(); ++p) {
- f->open_object_section("op");
- (*p)->dump(now, f);
- f->close_section(); // this OpRequest
- }
- f->close_section(); // list of OpRequests
- f->close_section(); // overall dump
-}
-
-void OpTracker::register_inflight_op(xlist<OpRequest*>::item *i)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- ops_in_flight.push_back(i);
- ops_in_flight.back()->seq = seq++;
-}
-
-void OpTracker::unregister_inflight_op(OpRequest *i)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- assert(i->xitem.get_list() == &ops_in_flight);
- utime_t now = ceph_clock_now(cct);
- i->xitem.remove_myself();
- i->request->clear_data();
- history.insert(now, OpRequestRef(i));
-}
-
-bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- if (!ops_in_flight.size())
- return false;
-
- utime_t now = ceph_clock_now(cct);
- utime_t too_old = now;
- too_old -= cct->_conf->osd_op_complaint_time;
-
- utime_t oldest_secs = now - ops_in_flight.front()->received_time;
-
- dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
- << "; oldest is " << oldest_secs
- << " seconds old" << dendl;
-
- if (oldest_secs < cct->_conf->osd_op_complaint_time)
- return false;
-
- xlist<OpRequest*>::iterator i = ops_in_flight.begin();
- warning_vector.reserve(cct->_conf->osd_op_log_threshold + 1);
-
- int slow = 0; // total slow
- int warned = 0; // total logged
- while (!i.end() && (*i)->received_time < too_old) {
- slow++;
-
- // exponential backoff of warning intervals
- if (((*i)->received_time +
- (cct->_conf->osd_op_complaint_time *
- (*i)->warn_interval_multiplier)) < now) {
- // will warn
- if (warning_vector.empty())
- warning_vector.push_back("");
- warned++;
- if (warned > cct->_conf->osd_op_log_threshold)
- break;
-
- utime_t age = now - (*i)->received_time;
- stringstream ss;
- ss << "slow request " << age << " seconds old, received at " << (*i)->received_time
- << ": " << *((*i)->request) << " currently "
- << ((*i)->current.size() ? (*i)->current : (*i)->state_string());
- warning_vector.push_back(ss.str());
-
- // only those that have been shown will backoff
- (*i)->warn_interval_multiplier *= 2;
- }
- ++i;
- }
-
- // only summarize if we warn about any. if everything has backed
- // off, we will stay silent.
- if (warned > 0) {
- stringstream ss;
- ss << slow << " slow requests, " << warned << " included below; oldest blocked for > "
- << oldest_secs << " secs";
- warning_vector[0] = ss.str();
- }
-
- return warning_vector.size();
-}
-
-void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
-{
- Mutex::Locker locker(ops_in_flight_lock);
-
- h->clear();
-
- utime_t now = ceph_clock_now(NULL);
- unsigned bin = 30;
- uint32_t lb = 1 << (bin-1); // lower bound for this bin
- int count = 0;
- for (xlist<OpRequest*>::iterator i = ops_in_flight.begin(); !i.end(); ++i) {
- utime_t age = now - (*i)->received_time;
- uint32_t ms = (long)(age * 1000.0);
- if (ms >= lb) {
- count++;
- continue;
- }
- if (count)
- h->set(bin, count);
- while (lb > ms) {
- bin--;
- lb >>= 1;
- }
- count = 1;
- }
- if (count)
- h->set(bin, count);
-}
-
-void OpRequest::dump(utime_t now, Formatter *f) const
+void OpRequest::_dump(utime_t now, Formatter *f) const
{
Message *m = request;
- stringstream name;
- m->print(name);
- f->dump_string("description", name.str().c_str()); // this OpRequest
- f->dump_unsigned("rmw_flags", rmw_flags);
- f->dump_stream("received_at") << received_time;
- f->dump_float("age", now - received_time);
- f->dump_float("duration", get_duration());
f->dump_string("flag_point", state_string());
if (m->get_orig_source().is_client()) {
f->open_object_section("client_info");
@@ -257,50 +49,11 @@ void OpRequest::dump(utime_t now, Formatter *f) const
}
}
-void OpTracker::mark_event(OpRequest *op, const string &dest)
-{
- utime_t now = ceph_clock_now(cct);
- return _mark_event(op, dest, now);
-}
-
-void OpTracker::_mark_event(OpRequest *op, const string &evt,
- utime_t time)
-{
- Mutex::Locker locker(ops_in_flight_lock);
- dout(5) << "reqid: " << op->get_reqid() << ", seq: " << op->seq
- << ", time: " << time << ", event: " << evt
- << ", request: " << *op->request << dendl;
-}
-
-void OpTracker::RemoveOnDelete::operator()(OpRequest *op) {
- op->mark_event("done");
- tracker->unregister_inflight_op(op);
- // Do not delete op, unregister_inflight_op took control
-}
-
-OpRequestRef OpTracker::create_request(Message *ref)
-{
- OpRequestRef retval(new OpRequest(ref, this),
- RemoveOnDelete(this));
-
- if (ref->get_type() == CEPH_MSG_OSD_OP) {
- retval->reqid = static_cast<MOSDOp*>(ref)->get_reqid();
- } else if (ref->get_type() == MSG_OSD_SUBOP) {
- retval->reqid = static_cast<MOSDSubOp*>(ref)->reqid;
- }
- _mark_event(retval.get(), "header_read", ref->get_recv_stamp());
- _mark_event(retval.get(), "throttled", ref->get_throttle_stamp());
- _mark_event(retval.get(), "all_read", ref->get_recv_complete_stamp());
- _mark_event(retval.get(), "dispatched", ref->get_dispatch_stamp());
- return retval;
-}
-
-void OpRequest::mark_event(const string &event)
+void OpRequest::init_from_message()
{
- utime_t now = ceph_clock_now(tracker->cct);
- {
- Mutex::Locker l(lock);
- events.push_back(make_pair(now, event));
+ if (request->get_type() == CEPH_MSG_OSD_OP) {
+ reqid = static_cast<MOSDOp*>(request)->get_reqid();
+ } else if (request->get_type() == MSG_OSD_SUBOP) {
+ reqid = static_cast<MOSDSubOp*>(request)->reqid;
}
- tracker->mark_event(this, event);
}
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index 50ade71a1b9..87571f58787 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -25,87 +25,12 @@
#include "common/TrackedOp.h"
#include "osd/osd_types.h"
-struct OpRequest;
-class OpTracker;
-typedef std::tr1::shared_ptr<OpRequest> OpRequestRef;
-class OpHistory {
- set<pair<utime_t, OpRequestRef> > arrived;
- set<pair<double, OpRequestRef> > duration;
- void cleanup(utime_t now);
- bool shutdown;
- OpTracker *tracker;
-
-public:
- OpHistory(OpTracker *tracker_) : shutdown(false), tracker(tracker_) {}
- ~OpHistory() {
- assert(arrived.empty());
- assert(duration.empty());
- }
- void insert(utime_t now, OpRequestRef op);
- void dump_ops(utime_t now, Formatter *f);
- void on_shutdown();
-};
-
-class OpTracker {
- class RemoveOnDelete {
- OpTracker *tracker;
- public:
- RemoveOnDelete(OpTracker *tracker) : tracker(tracker) {}
- void operator()(OpRequest *op);
- };
- friend class RemoveOnDelete;
- friend class OpRequest;
- friend class OpHistory;
- uint64_t seq;
- Mutex ops_in_flight_lock;
- xlist<OpRequest *> ops_in_flight;
- OpHistory history;
-
-protected:
- CephContext *cct;
-
-public:
- OpTracker(CephContext *cct_) : seq(0), ops_in_flight_lock("OpTracker mutex"), history(this), cct(cct_) {}
- void dump_ops_in_flight(Formatter *f);
- void dump_historic_ops(Formatter *f);
- void register_inflight_op(xlist<OpRequest*>::item *i);
- void unregister_inflight_op(OpRequest *i);
-
- void get_age_ms_histogram(pow2_hist_t *h);
-
- /**
- * Look for Ops which are too old, and insert warning
- * strings for each Op that is too old.
- *
- * @param warning_strings A vector<string> reference which is filled
- * with a warning string for each old Op.
- * @return True if there are any Ops to warn on, false otherwise.
- */
- bool check_ops_in_flight(std::vector<string> &warning_strings);
- void mark_event(OpRequest *op, const string &evt);
- void _mark_event(OpRequest *op, const string &evt, utime_t now);
- OpRequestRef create_request(Message *req);
- void on_shutdown() {
- Mutex::Locker l(ops_in_flight_lock);
- history.on_shutdown();
- }
- ~OpTracker() {
- assert(ops_in_flight.empty());
- }
-};
-
/**
* The OpRequest takes in a Message* and takes over a single reference
* to it, which it puts() when destroyed.
- * OpRequest is itself ref-counted. The expectation is that you get a Message
- * you want to track, create an OpRequest with it, and then pass around that OpRequest
- * the way you used to pass around the Message.
*/
struct OpRequest : public TrackedOp {
friend class OpTracker;
- friend class OpHistory;
- Message *request;
- xlist<OpRequest*>::item xitem;
// rmw flags
int rmw_flags;
@@ -134,28 +59,12 @@ struct OpRequest : public TrackedOp {
void set_class_write() { rmw_flags |= CEPH_OSD_RMW_FLAG_CLASS_WRITE; }
void set_pg_op() { rmw_flags |= CEPH_OSD_RMW_FLAG_PGOP; }
- utime_t received_time;
- uint8_t warn_interval_multiplier;
- utime_t get_arrived() const {
- return received_time;
- }
- double get_duration() const {
- return events.size() ?
- (events.rbegin()->first - received_time) :
- 0.0;
- }
-
- void dump(utime_t now, Formatter *f) const;
+ void _dump(utime_t now, Formatter *f) const;
private:
- list<pair<utime_t, string> > events;
- string current;
- Mutex lock;
- OpTracker *tracker;
osd_reqid_t reqid;
uint8_t hit_flag_points;
uint8_t latest_flag_point;
- uint64_t seq;
static const uint8_t flag_queued_for_pg=1 << 0;
static const uint8_t flag_reached_pg = 1 << 1;
static const uint8_t flag_delayed = 1 << 2;
@@ -164,12 +73,8 @@ private:
static const uint8_t flag_commit_sent = 1 << 5;
OpRequest(Message *req, OpTracker *tracker);
-public:
- ~OpRequest() {
- assert(request);
- request->put();
- }
+public:
bool been_queued_for_pg() { return hit_flag_points & flag_queued_for_pg; }
bool been_reached_pg() { return hit_flag_points & flag_reached_pg; }
bool been_delayed() { return hit_flag_points & flag_delayed; }
@@ -233,10 +138,15 @@ public:
latest_flag_point = flag_commit_sent;
}
- void mark_event(const string &event);
osd_reqid_t get_reqid() const {
return reqid;
}
+
+ void init_from_message();
+
+ typedef std::tr1::shared_ptr<OpRequest> Ref;
};
+typedef OpRequest::Ref OpRequestRef;
+
#endif /* OPREQUEST_H_ */
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index f319d160a39..8f7d3ccb684 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1,4 +1,3 @@
-
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
@@ -1333,10 +1332,10 @@ void PG::do_pending_flush()
bool PG::op_has_sufficient_caps(OpRequestRef op)
{
// only check MOSDOp
- if (op->request->get_type() != CEPH_MSG_OSD_OP)
+ if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
return true;
- MOSDOp *req = static_cast<MOSDOp*>(op->request);
+ MOSDOp *req = static_cast<MOSDOp*>(op->get_req());
OSD::Session *session = (OSD::Session *)req->get_connection()->get_priv();
if (!session) {
@@ -1399,76 +1398,6 @@ void PG::queue_op(OpRequestRef op)
osd->op_wq.queue(make_pair(PGRef(this), op));
}
-void PG::do_request(
- OpRequestRef op,
- ThreadPool::TPHandle &handle)
-{
- // do any pending flush
- do_pending_flush();
-
- if (!op_has_sufficient_caps(op)) {
- osd->reply_op_error(op, -EPERM);
- return;
- }
- assert(!op_must_wait_for_map(get_osdmap(), op));
- if (can_discard_request(op)) {
- return;
- }
- if (!flushed) {
- dout(20) << " !flushed, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
-
- switch (op->request->get_type()) {
- case CEPH_MSG_OSD_OP:
- if (is_replay() || !is_active()) {
- dout(20) << " replay, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
- do_op(op); // do it now
- break;
-
- case MSG_OSD_SUBOP:
- do_sub_op(op);
- break;
-
- case MSG_OSD_SUBOPREPLY:
- do_sub_op_reply(op);
- break;
-
- case MSG_OSD_PG_SCAN:
- do_scan(op, handle);
- break;
-
- case MSG_OSD_PG_BACKFILL:
- do_backfill(op);
- break;
-
- case MSG_OSD_PG_PUSH:
- if (!is_active()) {
- waiting_for_active.push_back(op);
- op->mark_delayed("waiting for active");
- return;
- }
- do_push(op);
- break;
-
- case MSG_OSD_PG_PULL:
- do_pull(op);
- break;
-
- case MSG_OSD_PG_PUSH_REPLY:
- do_push_reply(op);
- break;
-
- default:
- assert(0 == "bad message type in do_request");
- }
-}
-
-
void PG::replay_queued_ops()
{
assert(is_replay() && is_active());
@@ -1488,7 +1417,7 @@ void PG::replay_queued_ops()
c = p->first;
}
dout(10) << "activate replay " << p->first << " "
- << *p->second->request << dendl;
+ << *p->second->get_req() << dendl;
replay.push_back(p->second);
}
replay_queue.clear();
@@ -2067,8 +1996,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
hobject_t cur;
vector<hobject_t> objects;
while (1) {
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2116,8 +2044,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
while (1) {
dout(1) << "Updating snap_mapper from main collection, "
<< done << " objects done" << dendl;
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2140,19 +2067,16 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
++j) {
if (j->snap < CEPH_MAXSNAP) {
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- bufferptr bp;
- r = store->getattr(
- cid,
+ bufferlist bl;
+ r = get_pgbackend()->objects_get_attr(
*j,
OI_ATTR,
- bp);
+ &bl);
if (r < 0) {
derr << __func__ << ": getattr returned "
<< cpp_strerror(r) << dendl;
assert(0);
}
- bufferlist bl;
- bl.push_back(bp);
object_info_t oi(bl);
set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
set<snapid_t> cur_snaps;
@@ -2254,7 +2178,8 @@ epoch_t PG::peek_map_epoch(ObjectStore *store, coll_t coll, hobject_t &infos_oid
snapid_t snap;
bool ok = coll.is_pg(pgid, snap);
assert(ok);
- store->collection_getattr(coll, "info", *bl);
+ int r = store->collection_getattr(coll, "info", *bl);
+ assert(r > 0);
bufferlist::iterator bp = bl->begin();
__u8 struct_v = 0;
::decode(struct_v, bp);
@@ -2481,9 +2406,8 @@ void PG::log_weirdness()
<< " log bound mismatch, empty but (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]\n";
} else {
- if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()) || // sloppy check
- (pg_log.get_log().log.rbegin()->version != pg_log.get_head() &&
- !(pg_log.get_head() == pg_log.get_tail())))
+ // sloppy check
+ if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
osd->clog.error() << info.pgid
<< " log bound mismatch, info (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]"
@@ -2694,7 +2618,7 @@ void PG::unreg_next_scrub()
void PG::sub_op_scrub_map(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_map" << dendl;
@@ -2880,7 +2804,7 @@ void PG::_request_scrub_map(int replica, eversion_t version,
void PG::sub_op_scrub_reserve(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_reserve" << dendl;
@@ -2900,7 +2824,7 @@ void PG::sub_op_scrub_reserve(OpRequestRef op)
void PG::sub_op_scrub_reserve_reply(OpRequestRef op)
{
- MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request);
+ MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req());
assert(reply->get_header().type == MSG_OSD_SUBOPREPLY);
dout(7) << "sub_op_scrub_reserve_reply" << dendl;
@@ -2933,7 +2857,7 @@ void PG::sub_op_scrub_reserve_reply(OpRequestRef op)
void PG::sub_op_scrub_unreserve(OpRequestRef op)
{
- assert(op->request->get_header().type == MSG_OSD_SUBOP);
+ assert(op->get_req()->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_unreserve" << dendl;
op->mark_started();
@@ -2945,7 +2869,7 @@ void PG::sub_op_scrub_stop(OpRequestRef op)
{
op->mark_started();
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_scrub_stop" << dendl;
@@ -3108,9 +3032,9 @@ int PG::build_scrub_map_chunk(
// objects
vector<hobject_t> ls;
- int ret = osd->store->collection_list_range(coll, start, end, 0, &ls);
+ int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
if (ret < 0) {
- dout(5) << "collection_list_range error: " << ret << dendl;
+ dout(5) << "objects_list_range error: " << ret << dendl;
return ret;
}
@@ -3630,11 +3554,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
hobject_t start = scrubber.start;
while (!boundary_found) {
vector<hobject_t> objects;
- ret = osd->store->collection_list_partial(coll, start,
- cct->_conf->osd_scrub_chunk_min,
- cct->_conf->osd_scrub_chunk_max,
- 0,
- &objects, &scrubber.end);
+ ret = get_pgbackend()->objects_list_partial(
+ start,
+ cct->_conf->osd_scrub_chunk_min,
+ cct->_conf->osd_scrub_chunk_max,
+ 0,
+ &objects,
+ &scrubber.end);
assert(ret >= 0);
// in case we don't find a boundary: start again at the end
@@ -4806,7 +4732,7 @@ ostream& operator<<(ostream& out, const PG& pg)
bool PG::can_discard_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
if (OSD::op_is_discardable(m)) {
dout(20) << " discard " << *m << dendl;
return true;
@@ -4834,7 +4760,7 @@ bool PG::can_discard_op(OpRequestRef op)
template<typename T, int MSGTYPE>
bool PG::can_discard_replica_op(OpRequestRef op)
{
- T *m = static_cast<T *>(op->request);
+ T *m = static_cast<T *>(op->get_req());
assert(m->get_header().type == MSGTYPE);
// same pg?
@@ -4850,7 +4776,7 @@ bool PG::can_discard_replica_op(OpRequestRef op)
bool PG::can_discard_scan(OpRequestRef op)
{
- MOSDPGScan *m = static_cast<MOSDPGScan *>(op->request);
+ MOSDPGScan *m = static_cast<MOSDPGScan *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_SCAN);
if (old_peering_msg(m->map_epoch, m->query_epoch)) {
@@ -4862,7 +4788,7 @@ bool PG::can_discard_scan(OpRequestRef op)
bool PG::can_discard_backfill(OpRequestRef op)
{
- MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->request);
+ MOSDPGBackfill *m = static_cast<MOSDPGBackfill *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_BACKFILL);
if (old_peering_msg(m->map_epoch, m->query_epoch)) {
@@ -4876,7 +4802,7 @@ bool PG::can_discard_backfill(OpRequestRef op)
bool PG::can_discard_request(OpRequestRef op)
{
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case CEPH_MSG_OSD_OP:
return can_discard_op(op);
case MSG_OSD_SUBOP:
@@ -4901,55 +4827,55 @@ bool PG::can_discard_request(OpRequestRef op)
bool PG::split_request(OpRequestRef op, unsigned match, unsigned bits)
{
unsigned mask = ~((~0)<<bits);
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case CEPH_MSG_OSD_OP:
- return (static_cast<MOSDOp*>(op->request)->get_pg().m_seed & mask) == match;
+ return (static_cast<MOSDOp*>(op->get_req())->get_pg().m_seed & mask) == match;
}
return false;
}
bool PG::op_must_wait_for_map(OSDMapRef curmap, OpRequestRef op)
{
- switch (op->request->get_type()) {
+ switch (op->get_req()->get_type()) {
case CEPH_MSG_OSD_OP:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDOp*>(op->request)->get_map_epoch());
+ static_cast<MOSDOp*>(op->get_req())->get_map_epoch());
case MSG_OSD_SUBOP:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDSubOp*>(op->request)->map_epoch);
+ static_cast<MOSDSubOp*>(op->get_req())->map_epoch);
case MSG_OSD_SUBOPREPLY:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDSubOpReply*>(op->request)->map_epoch);
+ static_cast<MOSDSubOpReply*>(op->get_req())->map_epoch);
case MSG_OSD_PG_SCAN:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGScan*>(op->request)->map_epoch);
+ static_cast<MOSDPGScan*>(op->get_req())->map_epoch);
case MSG_OSD_PG_BACKFILL:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGBackfill*>(op->request)->map_epoch);
+ static_cast<MOSDPGBackfill*>(op->get_req())->map_epoch);
case MSG_OSD_PG_PUSH:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGPush*>(op->request)->map_epoch);
+ static_cast<MOSDPGPush*>(op->get_req())->map_epoch);
case MSG_OSD_PG_PULL:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGPull*>(op->request)->map_epoch);
+ static_cast<MOSDPGPull*>(op->get_req())->map_epoch);
case MSG_OSD_PG_PUSH_REPLY:
return !have_same_or_newer_map(
curmap,
- static_cast<MOSDPGPushReply*>(op->request)->map_epoch);
+ static_cast<MOSDPGPushReply*>(op->get_req())->map_epoch);
}
assert(0);
return false;
diff --git a/src/osd/PG.h b/src/osd/PG.h
index cdbe827a4a9..275d30c7658 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -48,6 +48,7 @@
#include "common/WorkQueue.h"
#include "common/ceph_context.h"
#include "include/str_list.h"
+#include "PGBackend.h"
#include <list>
#include <memory>
@@ -193,6 +194,8 @@ protected:
CephContext *cct;
OSDriver osdriver;
SnapMapper snap_mapper;
+
+ virtual PGBackend *get_pgbackend() = 0;
public:
void update_snap_mapper_bits(uint32_t bits) {
snap_mapper.update_bits(bits);
@@ -439,6 +442,7 @@ protected:
*/
struct BackfillInterval {
// info about a backfill interval on a peer
+ eversion_t version; /// version at which the scan occurred
map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
@@ -447,6 +451,7 @@ protected:
void clear() {
objects.clear();
begin = end = hobject_t();
+ version = eversion_t();
}
void reset(hobject_t start) {
@@ -870,8 +875,12 @@ public:
virtual void _scrub(ScrubMap &map) { }
virtual void _scrub_clear_state() { }
virtual void _scrub_finish() { }
- virtual coll_t get_temp_coll() = 0;
- virtual bool have_temp_coll() = 0;
+ virtual void get_colls(list<coll_t> *out) = 0;
+ virtual void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) = 0;
virtual bool _report_snap_collection_errors(
const hobject_t &hoid,
const map<string, bufferptr> &attrs,
@@ -1789,10 +1798,10 @@ public:
// abstract bits
- void do_request(
+ virtual void do_request(
OpRequestRef op,
ThreadPool::TPHandle &handle
- );
+ ) = 0;
virtual void do_op(OpRequestRef op) = 0;
virtual void do_sub_op(OpRequestRef op) = 0;
@@ -1802,9 +1811,6 @@ public:
ThreadPool::TPHandle &handle
) = 0;
virtual void do_backfill(OpRequestRef op) = 0;
- virtual void do_push(OpRequestRef op) = 0;
- virtual void do_pull(OpRequestRef op) = 0;
- virtual void do_push_reply(OpRequestRef op) = 0;
virtual void snap_trimmer() = 0;
virtual int do_command(cmdmap_t cmdmap, ostream& ss,
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
new file mode 100644
index 00000000000..408c589a08a
--- /dev/null
+++ b/src/osd/PGBackend.h
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "include/Context.h"
+#include <string>
+
+ /**
+ * PGBackend
+ *
+ * PGBackend defines an interface for logic handling IO and
+ * replication on RADOS objects. The PGBackend implementation
+ * is responsible for:
+ *
+ * 1) Handling client operations
+ * 2) Handling object recovery
+ * 3) Handling object access
+ */
+ class PGBackend {
+ public:
+ /**
+ * Provides interfaces for PGBackend callbacks
+ *
+ * The intention is that the parent calls into the PGBackend
+ * implementation holding a lock and that the callbacks are
+ * called under the same locks.
+ */
+ class Listener {
+ public:
+ /// Recovery
+
+ virtual void on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t) = 0;
+ /**
+ * Called with the transaction recovering oid
+ */
+ virtual void on_local_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ ) = 0;
+
+ /**
+ * Called when transaction recovering oid is durable and
+ * applied on all replicas
+ */
+ virtual void on_global_recover(const hobject_t &oid) = 0;
+
+ /**
+ * Called when peer is recovered
+ */
+ virtual void on_peer_recover(
+ int peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat
+ ) = 0;
+
+ virtual void begin_peer_recover(
+ int peer,
+ const hobject_t oid) = 0;
+
+ virtual void failed_push(int from, const hobject_t &soid) = 0;
+
+
+ virtual void cancel_pull(const hobject_t &soid) = 0;
+
+ /**
+ * Bless a context
+ *
+ * Wraps a context in whatever outer layers the parent usually
+ * uses to call into the PGBackend
+ */
+ virtual Context *bless_context(Context *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual void send_message(int to_osd, Message *m) = 0;
+ virtual void queue_transaction(ObjectStore::Transaction *t) = 0;
+ virtual epoch_t get_epoch() = 0;
+ virtual const vector<int> &get_acting() = 0;
+ virtual std::string gen_dbg_prefix() const = 0;
+
+ virtual const map<hobject_t, set<int> > &get_missing_loc() = 0;
+ virtual const map<int, pg_missing_t> &get_peer_missing() = 0;
+ virtual const map<int, pg_info_t> &get_peer_info() = 0;
+ virtual const pg_missing_t &get_local_missing() = 0;
+ virtual const PGLog &get_log() = 0;
+ virtual bool pgb_is_primary() const = 0;
+ virtual OSDMapRef pgb_get_osdmap() const = 0;
+ virtual const pg_info_t &get_info() const = 0;
+
+ virtual ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ map<string, bufferptr> &attrs) = 0;
+
+ virtual ~Listener() {}
+ };
+ Listener *parent;
+ Listener *get_parent() const { return parent; }
+ PGBackend(Listener *l) : parent(l) {}
+ bool is_primary() const { return get_parent()->pgb_is_primary(); }
+ OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+ const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+ std::string gen_prefix() const {
+ return parent->gen_dbg_prefix();
+ }
+
+ /**
+ * RecoveryHandle
+ *
+ * We may want to recover multiple objects in the same set of
+ * messages. RecoveryHandle is an interface for the opaque
+ * object used by the implementation to store the details of
+ * the pending recovery operations.
+ */
+ struct RecoveryHandle {
+ virtual ~RecoveryHandle() {}
+ };
+
+ /// Get a fresh recovery operation
+ virtual RecoveryHandle *open_recovery_op() = 0;
+
+ /// run_recovery_op: finish the operation represented by h
+ virtual void run_recovery_op(
+ RecoveryHandle *h, ///< [in] op to finish
+ int priority ///< [in] msg priority
+ ) = 0;
+
+ /**
+ * recover_object
+ *
+ * Triggers a recovery operation on the specified hobject_t
+ * onreadable must be called before onwriteable
+ *
+ * On each replica (primary included), get_parent()->on_not_missing()
+ * must be called when the transaction finalizing the recovery
+ * is queued. Similarly, get_parent()->on_readable() must be called
+ * when the transaction is applied in the backing store.
+ *
+ * get_parent()->on_not_degraded() should be called on the primary
+ * when writes can resume on the object.
+ *
+ * obc may be NULL if the primary lacks the object.
+ *
+ * head may be NULL only if the head/snapdir is missing
+ *
+ * @param missing [in] set of info, missing pairs for queried nodes
+ * @param overlaps [in] mapping of object to file offset overlaps
+ */
+ virtual void recover_object(
+ const hobject_t &hoid, ///< [in] object to recover
+ ObjectContextRef head, ///< [in] context of the head/snapdir object
+ ObjectContextRef obc, ///< [in] context of the object
+ RecoveryHandle *h ///< [in,out] handle to attach recovery op to
+ ) = 0;
+
+ /// gives PGBackend a crack at an incoming message
+ virtual bool handle_message(
+ OpRequestRef op ///< [in] message received
+ ) = 0; ///< @return true if the message was handled
+
+ virtual void check_recovery_sources(const OSDMapRef osdmap) = 0;
+
+ /**
+ * implementation should clear itself, contexts blessed prior to on_change
+ * won't be called after on_change()
+ */
+ virtual void on_change(ObjectStore::Transaction *t) = 0;
+ virtual void clear_state() = 0;
+
+ virtual void on_flushed() = 0;
+
+
+ virtual void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void temp_colls(list<coll_t> *out) = 0;
+
+ virtual void dump_recovery_info(Formatter *f) const = 0;
+
+ virtual coll_t get_temp_coll(ObjectStore::Transaction *t) = 0;
+ virtual void add_temp_obj(const hobject_t &oid) = 0;
+ virtual void clear_temp_obj(const hobject_t &oid) = 0;
+
+ virtual ~PGBackend() {}
+
+ /// List objects in collection
+ virtual int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next) = 0;
+
+ virtual int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls) = 0;
+
+ virtual int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out) = 0;
+ };
+
+#endif
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 486d64302b9..1949c96fd57 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -52,13 +52,9 @@ void PGLog::IndexedLog::split_into(
if (log.empty())
tail = head;
- else
- head = log.rbegin()->version;
if (olog->empty())
olog->tail = olog->head;
- else
- olog->head = olog->log.rbegin()->version;
olog->index();
index();
@@ -782,10 +778,6 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
log.tail = info.log_tail;
- // In case of sobject_t based encoding, may need to list objects in the store
- // to find hashes
- vector<hobject_t> ls;
-
if (ondisklog_head > 0) {
// read
bufferlist bl;
@@ -803,7 +795,6 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
assert(log.empty());
eversion_t last;
bool reorder = false;
- bool listed_collection = false;
while (!p.end()) {
uint64_t pos = ondisklog_tail + p.get_off();
@@ -846,29 +837,7 @@ void PGLog::read_log_old(ObjectStore *store, coll_t coll, hobject_t log_oid,
<< e.version << " after " << last << "\n";
}
- if (e.invalid_hash) {
- // We need to find the object in the store to get the hash
- if (!listed_collection) {
- store->collection_list(coll, ls);
- listed_collection = true;
- }
- bool found = false;
- for (vector<hobject_t>::iterator i = ls.begin();
- i != ls.end();
- ++i) {
- if (i->oid == e.soid.oid && i->snap == e.soid.snap) {
- e.soid = *i;
- found = true;
- break;
- }
- }
- if (!found) {
- // Didn't find the correct hash
- std::ostringstream oss;
- oss << "Could not find hash for hoid " << e.soid << std::endl;
- throw read_log_error(oss.str().c_str());
- }
- }
+ assert(!e.invalid_hash);
if (e.invalid_pool) {
e.soid.pool = info.pgid.pool();
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
new file mode 100644
index 00000000000..9529e15ae77
--- /dev/null
+++ b/src/osd/ReplicatedBackend.cc
@@ -0,0 +1,268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "ReplicatedBackend.h"
+#include "messages/MOSDSubOp.h"
+#include "messages/MOSDSubOpReply.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPushReply.h"
+
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) {
+ return *_dout << pgb->get_parent()->gen_dbg_prefix();
+}
+
+ReplicatedBackend::ReplicatedBackend(
+ PGBackend::Listener *pg, coll_t coll, OSDService *osd) :
+ PGBackend(pg), temp_created(false),
+ temp_coll(coll_t::make_temp_coll(pg->get_info().pgid)),
+ coll(coll), osd(osd), cct(osd->cct) {}
+
+void ReplicatedBackend::run_recovery_op(
+ PGBackend::RecoveryHandle *_h,
+ int priority)
+{
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ send_pushes(priority, h->pushes);
+ send_pulls(priority, h->pulls);
+ delete h;
+}
+
+void ReplicatedBackend::recover_object(
+ const hobject_t &hoid,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *_h
+ )
+{
+ dout(10) << __func__ << ": " << hoid << dendl;
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ if (get_parent()->get_local_missing().is_missing(hoid)) {
+ assert(!obc);
+ // pull
+ prepare_pull(
+ hoid,
+ head,
+ h);
+ return;
+ } else {
+ assert(obc);
+ int started = start_pushes(
+ hoid,
+ obc,
+ h);
+ assert(started > 0);
+ }
+}
+
+void ReplicatedBackend::check_recovery_sources(const OSDMapRef osdmap)
+{
+ for(map<int, set<hobject_t> >::iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ) {
+ if (osdmap->is_down(i->first)) {
+ dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
+ << ", osdmap has it marked down" << dendl;
+ for (set<hobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ assert(pulling.count(*j) == 1);
+ get_parent()->cancel_pull(*j);
+ pulling.erase(*j);
+ }
+ pull_from_peer.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+}
+
+bool ReplicatedBackend::handle_message(
+ OpRequestRef op
+ )
+{
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PUSH:
+ // TODOXXX: needs to be active possibly
+ do_push(op);
+ return true;
+
+ case MSG_OSD_PG_PULL:
+ do_pull(op);
+ return true;
+
+ case MSG_OSD_PG_PUSH_REPLY:
+ do_push_reply(op);
+ return true;
+
+ case MSG_OSD_SUBOP: {
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
+ if (m->ops.size() >= 1) {
+ OSDOp *first = &m->ops[0];
+ switch (first->op.op) {
+ case CEPH_OSD_OP_PULL:
+ sub_op_pull(op);
+ return true;
+ case CEPH_OSD_OP_PUSH:
+ // TODOXXX: needs to be active possibly
+ sub_op_push(op);
+ return true;
+ default:
+ break;
+ }
+ }
+ break;
+ }
+
+ case MSG_OSD_SUBOPREPLY: {
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req());
+ if (r->ops.size() >= 1) {
+ OSDOp &first = r->ops[0];
+ switch (first.op.op) {
+ case CEPH_OSD_OP_PUSH:
+ // continue peer recovery
+ sub_op_push_reply(op);
+ return true;
+ }
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ return false;
+}
+
+void ReplicatedBackend::clear_state()
+{
+ // clear pushing/pulling maps
+ pushing.clear();
+ pulling.clear();
+ pull_from_peer.clear();
+}
+
+void ReplicatedBackend::on_change(ObjectStore::Transaction *t)
+{
+ dout(10) << __func__ << dendl;
+ // clear temp
+ for (set<hobject_t>::iterator i = temp_contents.begin();
+ i != temp_contents.end();
+ ++i) {
+ dout(10) << __func__ << ": Removing oid "
+ << *i << " from the temp collection" << dendl;
+ t->remove(get_temp_coll(t), *i);
+ }
+ temp_contents.clear();
+ clear_state();
+}
+
+coll_t ReplicatedBackend::get_temp_coll(ObjectStore::Transaction *t)
+{
+ if (temp_created)
+ return temp_coll;
+ if (!osd->store->collection_exists(temp_coll))
+ t->create_collection(temp_coll);
+ temp_created = true;
+ return temp_coll;
+}
+
+void ReplicatedBackend::on_flushed()
+{
+ if (have_temp_coll() &&
+ !osd->store->collection_empty(get_temp_coll())) {
+ vector<hobject_t> objects;
+ osd->store->collection_list(get_temp_coll(), objects);
+ derr << __func__ << ": found objects in the temp collection: "
+ << objects << ", crashing now"
+ << dendl;
+ assert(0 == "found garbage in the temp collection");
+ }
+}
+
+
+int ReplicatedBackend::objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next)
+{
+ vector<ghobject_t> objects;
+ ghobject_t _next;
+ int r = osd->store->collection_list_partial(
+ coll,
+ begin,
+ min,
+ max,
+ seq,
+ &objects,
+ &_next);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate());
+ ls->push_back(i->hobj);
+ }
+ assert(_next.is_degenerate());
+ *next = _next.hobj;
+ return r;
+}
+
+int ReplicatedBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls)
+{
+ vector<ghobject_t> objects;
+ int r = osd->store->collection_list_range(
+ coll,
+ start,
+ end,
+ seq,
+ &objects);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate());
+ ls->push_back(i->hobj);
+ }
+ return r;
+}
+
+int ReplicatedBackend::objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out)
+{
+ bufferptr bp;
+ int r = osd->store->getattr(
+ coll,
+ hoid,
+ attr.c_str(),
+ bp);
+ if (r >= 0 && out) {
+ out->clear();
+ out->push_back(bp);
+ }
+ return r;
+}
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
new file mode 100644
index 00000000000..cc5f060e136
--- /dev/null
+++ b/src/osd/ReplicatedBackend.h
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef REPBACKEND_H
+#define REPBACKEND_H
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "osd_types.h"
+
+struct C_ReplicatedBackend_OnPullComplete;
+class ReplicatedBackend : public PGBackend {
+ struct RPGHandle : public PGBackend::RecoveryHandle {
+ map<int, vector<PushOp> > pushes;
+ map<int, vector<PullOp> > pulls;
+ };
+ friend struct C_ReplicatedBackend_OnPullComplete;
+private:
+ bool temp_created;
+ const coll_t temp_coll;
+ coll_t get_temp_coll() const {
+ return temp_coll;
+ }
+ bool have_temp_coll() const { return temp_created; }
+
+ // Track contents of temp collection, clear on reset
+ set<hobject_t> temp_contents;
+public:
+ coll_t coll;
+ OSDService *osd;
+ CephContext *cct;
+
+ ReplicatedBackend(PGBackend::Listener *pg, coll_t coll, OSDService *osd);
+
+ /// @see PGBackend::open_recovery_op
+ RPGHandle *_open_recovery_op() {
+ return new RPGHandle();
+ }
+ PGBackend::RecoveryHandle *open_recovery_op() {
+ return _open_recovery_op();
+ }
+
+ /// @see PGBackend::run_recovery_op
+ void run_recovery_op(
+ PGBackend::RecoveryHandle *h,
+ int priority);
+
+ /// @see PGBackend::recover_object
+ void recover_object(
+ const hobject_t &hoid,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ );
+
+ void check_recovery_sources(const OSDMapRef osdmap);
+
+ /// @see PGBackend::handle_message
+ bool handle_message(
+ OpRequestRef op
+ );
+
+ void on_change(ObjectStore::Transaction *t);
+ void clear_state();
+ void on_flushed();
+
+ void temp_colls(list<coll_t> *out) {
+ if (temp_created)
+ out->push_back(temp_coll);
+ }
+ void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) {
+ coll_t target = coll_t::make_temp_coll(child);
+ if (!temp_created)
+ return;
+ t->create_collection(target);
+ t->split_collection(
+ temp_coll,
+ split_bits,
+ seed,
+ target);
+ }
+
+ virtual void dump_recovery_info(Formatter *f) const {
+ {
+ f->open_array_section("pull_from_peer");
+ for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ++i) {
+ f->open_object_section("pulling_from");
+ f->dump_int("pull_from", i->first);
+ {
+ f->open_array_section("pulls");
+ for (set<hobject_t>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("pull_info");
+ assert(pulling.count(*j));
+ pulling.find(*j)->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("pushing");
+ for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
+ pushing.begin();
+ i != pushing.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("pushing") << i->first;
+ {
+ f->open_array_section("pushing_to");
+ for (map<int, PushInfo>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("push_progress");
+ f->dump_stream("object_pushing") << j->first;
+ {
+ f->open_object_section("push_info");
+ j->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ }
+
+ /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out);
+private:
+ // push
+ struct PushInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+
+ void dump(Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+ };
+ map<hobject_t, map<int, PushInfo> > pushing;
+
+ // pull
+ struct PullInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef head_ctx;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+
+ void dump(Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ coll_t get_temp_coll(ObjectStore::Transaction *t);
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+
+ map<hobject_t, PullInfo> pulling;
+
+ // Reverse mapping from osd peer to objects beging pulled from that peer
+ map<int, set<hobject_t> > pull_from_peer;
+
+ void sub_op_push(OpRequestRef op);
+ void sub_op_push_reply(OpRequestRef op);
+ void sub_op_pull(OpRequestRef op);
+
+ void _do_push(OpRequestRef op);
+ void _do_pull_response(OpRequestRef op);
+ void do_push(OpRequestRef op) {
+ if (is_primary()) {
+ _do_pull_response(op);
+ } else {
+ _do_push(op);
+ }
+ }
+ void do_pull(OpRequestRef op);
+ void do_push_reply(OpRequestRef op);
+
+ bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
+ void handle_pull(int peer, PullOp &op, PushOp *reply);
+ bool handle_pull_response(
+ int from, PushOp &op, PullOp *response,
+ list<ObjectContextRef> *to_continue,
+ ObjectStore::Transaction *t);
+ void handle_push(int from, PushOp &op, PushReplyOp *response,
+ ObjectStore::Transaction *t);
+
+ static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ bufferlist data_received,
+ interval_set<uint64_t> *intervals_usable,
+ bufferlist *data_usable);
+ void _failed_push(int from, const hobject_t &soid);
+
+ void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
+ void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ int send_push_op_legacy(int priority, int peer,
+ PushOp &pop);
+ int send_pull_legacy(int priority, int peer,
+ const ObjectRecoveryInfo& recovery_info,
+ ObjectRecoveryProgress progress);
+ void send_pulls(
+ int priority,
+ map<int, vector<PullOp> > &pulls);
+
+ int build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat = 0);
+ void submit_push_data(ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ const interval_set<uint64_t> &intervals_included,
+ bufferlist data_included,
+ bufferlist omap_header,
+ map<string, bufferptr> &attrs,
+ map<string, bufferlist> &omap_entries,
+ ObjectStore::Transaction *t);
+ void submit_push_complete(ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+
+ void calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+ void prepare_pull(
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h);
+ int start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obj,
+ RPGHandle *h);
+ void prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, int peer,
+ PushOp *pop);
+ void prep_push(ObjectContextRef obc,
+ const hobject_t& oid, int dest,
+ PushOp *op);
+ void prep_push(ObjectContextRef obc,
+ const hobject_t& soid, int peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets,
+ PushOp *op);
+ void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets);
+ ObjectRecoveryInfo recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc
+ );
+};
+
+#endif
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index a48372fe561..f466eb8ccdc 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -60,8 +60,9 @@
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
-#define dout_prefix _prefix(_dout, this, osd->whoami, get_osdmap())
-static ostream& _prefix(std::ostream *_dout, PG *pg, int whoami, OSDMapRef osdmap) {
+#define dout_prefix _prefix(_dout, this)
+template <typename T>
+static ostream& _prefix(std::ostream *_dout, T *pg) {
return *_dout << pg->gen_prefix();
}
@@ -79,6 +80,159 @@ PGLSFilter::~PGLSFilter()
{
}
+static void log_subop_stats(
+ OSDService *osd,
+ OpRequestRef op, int tag_inb, int tag_lat)
+{
+ utime_t now = ceph_clock_now(g_ceph_context);
+ utime_t latency = now;
+ latency -= op->get_req()->get_recv_stamp();
+
+ uint64_t inb = op->get_req()->get_data().length();
+
+ osd->logger->inc(l_osd_sop);
+
+ osd->logger->inc(l_osd_sop_inb, inb);
+ osd->logger->tinc(l_osd_sop_lat, latency);
+
+ if (tag_inb)
+ osd->logger->inc(tag_inb, inb);
+ osd->logger->tinc(tag_lat, latency);
+}
+
+// ======================
+// PGBackend::Listener
+
+
+void ReplicatedPG::on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t)
+{
+ pg_log.revise_have(oid, eversion_t());
+ remove_snap_mapped_object(*t, oid);
+ t->remove(coll, oid);
+}
+
+void ReplicatedPG::on_local_recover(
+ const hobject_t &hoid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &_recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ )
+{
+ ObjectRecoveryInfo recovery_info(_recovery_info);
+ if (recovery_info.soid.snap < CEPH_NOSNAP) {
+ assert(recovery_info.oi.snaps.size());
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ set<snapid_t> snaps(
+ recovery_info.oi.snaps.begin(),
+ recovery_info.oi.snaps.end());
+ snap_mapper.add_oid(
+ recovery_info.soid,
+ snaps,
+ &_t);
+ }
+
+ if (pg_log.get_missing().is_missing(recovery_info.soid) &&
+ pg_log.get_missing().missing.find(recovery_info.soid)->second.need > recovery_info.version) {
+ assert(is_primary());
+ const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
+ if (latest->op == pg_log_entry_t::LOST_REVERT &&
+ latest->reverting_to == recovery_info.version) {
+ dout(10) << " got old revert version " << recovery_info.version
+ << " for " << *latest << dendl;
+ recovery_info.version = latest->version;
+ // update the attr to the revert event version
+ recovery_info.oi.prior_version = recovery_info.oi.version;
+ recovery_info.oi.version = latest->version;
+ bufferlist bl;
+ ::encode(recovery_info.oi, bl);
+ t->setattr(coll, recovery_info.soid, OI_ATTR, bl);
+ }
+ }
+
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
+ recover_got(recovery_info.soid, recovery_info.version);
+
+ if (is_primary()) {
+ info.stats.stats.sum.add(stat_diff);
+
+ assert(obc);
+ obc->obs.exists = true;
+ obc->ondisk_write_lock();
+ obc->obs.oi = recovery_info.oi; // may have been updated above
+
+
+ t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+ t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
+
+ publish_stats_to_osd();
+ if (waiting_for_missing_object.count(hoid)) {
+ dout(20) << " kicking waiters on " << hoid << dendl;
+ requeue_ops(waiting_for_missing_object[hoid]);
+ waiting_for_missing_object.erase(hoid);
+ if (pg_log.get_missing().missing.size() == 0) {
+ requeue_ops(waiting_for_all_missing);
+ waiting_for_all_missing.clear();
+ }
+ }
+ } else {
+ t->register_on_applied(
+ new C_OSD_AppliedRecoveredObjectReplica(this));
+
+ }
+
+ t->register_on_commit(
+ new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap()->get_epoch(),
+ info.last_complete));
+
+ // update pg
+ dirty_info = true;
+ write_if_dirty(*t);
+
+}
+
+void ReplicatedPG::on_global_recover(
+ const hobject_t &soid)
+{
+ publish_stats_to_osd();
+ dout(10) << "pushed " << soid << " to all replicas" << dendl;
+ assert(recovering.count(soid));
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ if (waiting_for_degraded_object.count(soid)) {
+ requeue_ops(waiting_for_degraded_object[soid]);
+ waiting_for_degraded_object.erase(soid);
+ }
+ finish_degraded_object(soid);
+}
+
+void ReplicatedPG::on_peer_recover(
+ int peer,
+ const hobject_t &soid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat)
+{
+ info.stats.stats.sum.add(stat);
+ publish_stats_to_osd();
+ // done!
+ peer_missing[peer].got(soid, recovery_info.version);
+ if (peer == backfill_target && backfills_in_flight.count(soid))
+ backfills_in_flight.erase(soid);
+}
+
+void ReplicatedPG::begin_peer_recover(
+ int peer,
+ const hobject_t soid)
+{
+ peer_missing[peer].revise_have(soid, eversion_t());
+}
+
// =======================
// pg changes
@@ -117,18 +271,18 @@ void ReplicatedPG::wait_for_missing_object(const hobject_t& soid, OpRequestRef o
assert(g != missing.missing.end());
const eversion_t &v(g->second.need);
- map<hobject_t, PullInfo>::const_iterator p = pulling.find(soid);
- if (p != pulling.end()) {
- dout(7) << "missing " << soid << " v " << v << ", already pulling." << dendl;
+ set<hobject_t>::const_iterator p = recovering.find(soid);
+ if (p != recovering.end()) {
+ dout(7) << "missing " << soid << " v " << v << ", already recovering." << dendl;
}
else if (missing_loc.find(soid) == missing_loc.end()) {
dout(7) << "missing " << soid << " v " << v << ", is unfound." << dendl;
}
else {
- dout(7) << "missing " << soid << " v " << v << ", pulling." << dendl;
- map<int, vector<PullOp> > pulls;
- prepare_pull(soid, v, cct->_conf->osd_client_op_priority, &pulls);
- send_pulls(cct->_conf->osd_client_op_priority, pulls);
+ dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
waiting_for_missing_object[soid].push_back(op);
op->mark_delayed("waiting for missing object");
@@ -165,15 +319,15 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
assert(is_degraded_object(soid));
// we don't have it (yet).
- if (pushing.count(soid)) {
+ if (recovering.count(soid)) {
dout(7) << "degraded "
<< soid
- << ", already pushing"
+ << ", already recovering"
<< dendl;
} else {
dout(7) << "degraded "
<< soid
- << ", pushing"
+ << ", recovering"
<< dendl;
eversion_t v;
for (unsigned i = 1; i < acting.size(); i++) {
@@ -184,9 +338,9 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
break;
}
}
- map<int, vector<PushOp> > pushes;
- prep_object_replica_pushes(soid, v, cct->_conf->osd_client_op_priority, &pushes);
- send_pushes(cct->_conf->osd_client_op_priority, pushes);
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ prep_object_replica_pushes(soid, v, h);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
waiting_for_degraded_object[soid].push_back(op);
op->mark_delayed("waiting for degraded object");
@@ -244,8 +398,10 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
bufferlist bl;
-
- int ret = osd->store->getattr(coll_t(info.pgid), sobj, filter->get_xattr().c_str(), bl);
+ int ret = pgbackend->objects_get_attr(
+ sobj,
+ filter->get_xattr(),
+ &bl);
dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
if (ret < 0)
return false;
@@ -427,7 +583,7 @@ bool ReplicatedPG::pg_op_must_wait(MOSDOp *op)
void ReplicatedPG::do_pg_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp *>(op->request);
+ MOSDOp *m = static_cast<MOSDOp *>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
dout(10) << "do_pg_op " << *m << dendl;
@@ -485,12 +641,13 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
hobject_t next;
hobject_t current = response.handle;
osr->flush();
- int r = osd->store->collection_list_partial(coll, current,
- list_size,
- list_size,
- snapid,
- &sentries,
- &next);
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ snapid,
+ &sentries,
+ &next);
if (r != 0) {
result = -EINVAL;
break;
@@ -528,13 +685,17 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
if (snapid != CEPH_NOSNAP) {
bufferlist bl;
if (candidate.snap == CEPH_NOSNAP) {
- osd->store->getattr(coll, candidate, SS_ATTR, bl);
+ pgbackend->objects_get_attr(
+ candidate,
+ SS_ATTR,
+ &bl);
SnapSet snapset(bl);
if (snapid <= snapset.seq)
continue;
} else {
bufferlist attr_bl;
- osd->store->getattr(coll, candidate, OI_ATTR, attr_bl);
+ pgbackend->objects_get_attr(
+ candidate, OI_ATTR, &attr_bl);
object_info_t oi(attr_bl);
vector<snapid_t>::iterator i = find(oi.snaps.begin(),
oi.snaps.end(),
@@ -628,9 +789,8 @@ ReplicatedPG::ReplicatedPG(OSDService *o, OSDMapRef curmap,
const PGPool &_pool, pg_t p, const hobject_t& oid,
const hobject_t& ioid) :
PG(o, curmap, _pool, p, oid, ioid),
+ pgbackend(new ReplicatedBackend(this, coll_t(p), o)),
snapset_contexts_lock("ReplicatedPG::snapset_contexts"),
- temp_created(false),
- temp_coll(coll_t::make_temp_coll(p)),
temp_seq(0),
snap_trimmer_machine(this)
{
@@ -644,13 +804,69 @@ void ReplicatedPG::get_src_oloc(const object_t& oid, const object_locator_t& olo
src_oloc.key = oid.name;
}
+void ReplicatedPG::do_request(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle)
+{
+ // do any pending flush
+ do_pending_flush();
+
+ if (!op_has_sufficient_caps(op)) {
+ osd->reply_op_error(op, -EPERM);
+ return;
+ }
+ assert(!op_must_wait_for_map(get_osdmap(), op));
+ if (can_discard_request(op)) {
+ return;
+ }
+ if (!flushed) {
+ dout(20) << " !flushed, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
+
+ if (pgbackend->handle_message(op))
+ return;
+
+ switch (op->get_req()->get_type()) {
+ case CEPH_MSG_OSD_OP:
+ if (is_replay() || !is_active()) {
+ dout(20) << " replay, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
+ do_op(op); // do it now
+ break;
+
+ case MSG_OSD_SUBOP:
+ do_sub_op(op);
+ break;
+
+ case MSG_OSD_SUBOPREPLY:
+ do_sub_op_reply(op);
+ break;
+
+ case MSG_OSD_PG_SCAN:
+ do_scan(op, handle);
+ break;
+
+ case MSG_OSD_PG_BACKFILL:
+ do_backfill(op);
+ break;
+
+ default:
+ assert(0 == "bad message type in do_request");
+ }
+}
+
+
/** do_op - do an op
* pg lock will be held (if multithreaded)
* osd_lock NOT held.
*/
void ReplicatedPG::do_op(OpRequestRef op)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
assert(m->get_header().type == CEPH_MSG_OSD_OP);
if (op->includes_pg_op()) {
if (pg_op_must_wait(m)) {
@@ -660,13 +876,21 @@ void ReplicatedPG::do_op(OpRequestRef op)
return do_pg_op(op);
}
- dout(10) << "do_op " << *m << (op->may_write() ? " may_write" : "") << dendl;
+ // order this op as a write?
+ bool write_ordered = op->may_write() || (m->get_flags() & CEPH_OSD_FLAG_RWORDERED);
+
+ dout(10) << "do_op " << *m
+ << (op->may_write() ? " may_write" : "")
+ << (op->may_read() ? " may_read" : "")
+ << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
+ << dendl;
hobject_t head(m->get_oid(), m->get_object_locator().key,
CEPH_NOSNAP, m->get_pg().ps(),
info.pgid.pool(), m->get_object_locator().nspace);
- if (op->may_write() && scrubber.write_blocked_by_scrub(head)) {
+
+ if (write_ordered && scrubber.write_blocked_by_scrub(head)) {
dout(20) << __func__ << ": waiting for scrub" << dendl;
waiting_for_active.push_back(op);
op->mark_delayed("waiting for scrub");
@@ -680,7 +904,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// degraded object?
- if (op->may_write() && is_degraded_object(head)) {
+ if (write_ordered && is_degraded_object(head)) {
wait_for_degraded_object(head, op);
return;
}
@@ -700,7 +924,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
// degraded object?
- if (op->may_write() && is_degraded_object(snapdir)) {
+ if (write_ordered && is_degraded_object(snapdir)) {
wait_for_degraded_object(snapdir, op);
return;
}
@@ -764,7 +988,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
return;
}
- if ((op->may_read()) && (obc->obs.oi.lost)) {
+ if ((op->may_read()) && (obc->obs.oi.is_lost())) {
// This object is lost. Reading from it returns an error.
dout(20) << __func__ << ": object " << obc->obs.oi.soid
<< " is lost" << dendl;
@@ -774,7 +998,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(25) << __func__ << ": object " << obc->obs.oi.soid
<< " has oi of " << obc->obs.oi << dendl;
- if (!op->may_write() && !obc->obs.exists) {
+ if (!op->may_write() && (!obc->obs.exists ||
+ obc->obs.oi.is_whiteout())) {
osd->reply_op_error(op, -ENOENT);
return;
}
@@ -831,6 +1056,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
if (sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.get_key() &&
sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.oid.name &&
@@ -885,6 +1112,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
dout(10) << " clone_oid " << clone_oid << " obc " << sobc << dendl;
src_obc[clone_oid] = sobc;
@@ -943,7 +1172,7 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, ObjectContextRef obc,
void ReplicatedPG::do_cache_redirect(OpRequestRef op, ObjectContextRef obc)
{
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
get_osdmap()->get_epoch(), flags);
@@ -959,7 +1188,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
{
dout(10) << __func__ << " " << ctx << dendl;
OpRequestRef op = ctx->op;
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
ObjectContextRef obc = ctx->obc;
const hobject_t& soid = obc->obs.oi.soid;
map<hobject_t,ObjectContextRef>& src_obc = ctx->src_obc;
@@ -1183,16 +1412,16 @@ void ReplicatedPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
void ReplicatedPG::log_op_stats(OpContext *ctx)
{
OpRequestRef op = ctx->op;
- MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
utime_t now = ceph_clock_now(cct);
utime_t latency = now;
- latency -= ctx->op->request->get_recv_stamp();
+ latency -= ctx->op->get_req()->get_recv_stamp();
utime_t rlatency;
if (ctx->readable_stamp != utime_t()) {
rlatency = ctx->readable_stamp;
- rlatency -= ctx->op->request->get_recv_stamp();
+ rlatency -= ctx->op->get_req()->get_recv_stamp();
}
uint64_t inb = ctx->bytes_written;
@@ -1229,41 +1458,16 @@ void ReplicatedPG::log_op_stats(OpContext *ctx)
<< " lat " << latency << dendl;
}
-void ReplicatedPG::log_subop_stats(OpRequestRef op, int tag_inb, int tag_lat)
-{
- utime_t now = ceph_clock_now(cct);
- utime_t latency = now;
- latency -= op->request->get_recv_stamp();
-
- uint64_t inb = op->request->get_data().length();
-
- osd->logger->inc(l_osd_sop);
-
- osd->logger->inc(l_osd_sop_inb, inb);
- osd->logger->tinc(l_osd_sop_lat, latency);
-
- if (tag_inb)
- osd->logger->inc(tag_inb, inb);
- osd->logger->tinc(tag_lat, latency);
-}
-
-
-
void ReplicatedPG::do_sub_op(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(have_same_or_newer_map(m->map_epoch));
assert(m->get_header().type == MSG_OSD_SUBOP);
- dout(15) << "do_sub_op " << *op->request << dendl;
+ dout(15) << "do_sub_op " << *op->get_req() << dendl;
OSDOp *first = NULL;
if (m->ops.size() >= 1) {
first = &m->ops[0];
- switch (first->op.op) {
- case CEPH_OSD_OP_PULL:
- sub_op_pull(op);
- return;
- }
}
if (!is_active()) {
@@ -1274,9 +1478,6 @@ void ReplicatedPG::do_sub_op(OpRequestRef op)
if (first) {
switch (first->op.op) {
- case CEPH_OSD_OP_PUSH:
- sub_op_push(op);
- return;
case CEPH_OSD_OP_DELETE:
sub_op_remove(op);
return;
@@ -1300,16 +1501,11 @@ void ReplicatedPG::do_sub_op(OpRequestRef op)
void ReplicatedPG::do_sub_op_reply(OpRequestRef op)
{
- MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->request);
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply *>(op->get_req());
assert(r->get_header().type == MSG_OSD_SUBOPREPLY);
if (r->ops.size() >= 1) {
OSDOp& first = r->ops[0];
switch (first.op.op) {
- case CEPH_OSD_OP_PUSH:
- // continue peer recovery
- sub_op_push_reply(op);
- return;
-
case CEPH_OSD_OP_SCRUB_RESERVE:
sub_op_scrub_reserve_reply(op);
return;
@@ -1323,7 +1519,7 @@ void ReplicatedPG::do_scan(
OpRequestRef op,
ThreadPool::TPHandle &handle)
{
- MOSDPGScan *m = static_cast<MOSDPGScan*>(op->request);
+ MOSDPGScan *m = static_cast<MOSDPGScan*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_SCAN);
dout(10) << "do_scan " << *m << dendl;
@@ -1347,8 +1543,9 @@ void ReplicatedPG::do_scan(
BackfillInterval bi;
osr->flush();
+ bi.begin = m->begin;
scan_range(
- m->begin, cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max, &bi, handle);
MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap()->get_epoch(), m->query_epoch,
@@ -1395,9 +1592,9 @@ void ReplicatedPG::do_scan(
}
}
-void ReplicatedPG::_do_push(OpRequestRef op)
+void ReplicatedBackend::_do_push(OpRequestRef op)
{
- MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request);
+ MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
int from = m->get_source().num();
@@ -1412,52 +1609,89 @@ void ReplicatedPG::_do_push(OpRequestRef op)
MOSDPGPushReply *reply = new MOSDPGPushReply;
reply->set_priority(m->get_priority());
- reply->pgid = info.pgid;
+ reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
reply->replies.swap(replies);
reply->compute_cost(cct);
- t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ t->register_on_complete(
+ new C_OSD_SendMessageOnConn(
+ osd, reply, m->get_connection()));
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
}
-void ReplicatedPG::_do_pull_response(OpRequestRef op)
+struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
+ ReplicatedBackend *bc;
+ list<ObjectContextRef> to_continue;
+ int priority;
+ C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
+ : bc(bc), priority(priority) {}
+
+ void finish(ThreadPool::TPHandle &handle) {
+ ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
+ for (list<ObjectContextRef>::iterator i =
+ to_continue.begin();
+ i != to_continue.end();
+ ++i) {
+ if (!bc->start_pushes((*i)->obs.oi.soid, *i, h)) {
+ bc->get_parent()->on_global_recover(
+ (*i)->obs.oi.soid);
+ }
+ handle.reset_tp_timeout();
+ }
+ bc->run_recovery_op(h, priority);
+ }
+};
+
+void ReplicatedBackend::_do_pull_response(OpRequestRef op)
{
- MOSDPGPush *m = static_cast<MOSDPGPush *>(op->request);
+ MOSDPGPush *m = static_cast<MOSDPGPush *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH);
int from = m->get_source().num();
vector<PullOp> replies(1);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
+ list<ObjectContextRef> to_continue;
for (vector<PushOp>::iterator i = m->pushes.begin();
i != m->pushes.end();
++i) {
- bool more = handle_pull_response(from, *i, &(replies.back()), t);
+ bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, t);
if (more)
replies.push_back(PullOp());
}
+ if (!to_continue.empty()) {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ m->get_priority());
+ c->to_continue.swap(to_continue);
+ t->register_on_complete(
+ new C_QueueInWQ(
+ &osd->push_wq,
+ get_parent()->bless_gencontext(c)));
+ }
replies.erase(replies.end() - 1);
if (replies.size()) {
MOSDPGPull *reply = new MOSDPGPull;
reply->set_priority(m->get_priority());
- reply->pgid = info.pgid;
+ reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
reply->pulls.swap(replies);
reply->compute_cost(cct);
- t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ t->register_on_complete(
+ new C_OSD_SendMessageOnConn(
+ osd, reply, m->get_connection()));
}
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
}
-void ReplicatedPG::do_pull(OpRequestRef op)
+void ReplicatedBackend::do_pull(OpRequestRef op)
{
- MOSDPGPull *m = static_cast<MOSDPGPull *>(op->request);
+ MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PULL);
int from = m->get_source().num();
@@ -1471,9 +1705,9 @@ void ReplicatedPG::do_pull(OpRequestRef op)
send_pushes(m->get_priority(), replies);
}
-void ReplicatedPG::do_push_reply(OpRequestRef op)
+void ReplicatedBackend::do_push_reply(OpRequestRef op)
{
- MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->request);
+ MOSDPGPushReply *m = static_cast<MOSDPGPushReply *>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_PUSH_REPLY);
int from = m->get_source().num();
@@ -1494,7 +1728,7 @@ void ReplicatedPG::do_push_reply(OpRequestRef op)
void ReplicatedPG::do_backfill(OpRequestRef op)
{
- MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->request);
+ MOSDPGBackfill *m = static_cast<MOSDPGBackfill*>(op->get_req());
assert(m->get_header().type == MSG_OSD_PG_BACKFILL);
dout(10) << "do_backfill " << *m << dendl;
@@ -2158,7 +2392,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ObjectContextRef src_obc;
if (ceph_osd_op_type_multi(op.op)) {
- MOSDOp *m = static_cast<MOSDOp *>(ctx->op->request);
+ MOSDOp *m = static_cast<MOSDOp *>(ctx->op->get_req());
object_locator_t src_oloc;
get_src_oloc(soid.oid, m->get_object_locator(), src_oloc);
hobject_t src_oid(osd_op.soid, src_oloc.key, soid.hash,
@@ -2408,13 +2642,35 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
break;
+ case CEPH_OSD_OP_ISDIRTY:
+ ++ctx->num_read;
+ {
+ bool is_dirty = obs.oi.is_dirty();
+ ::encode(is_dirty, osd_op.outdata);
+ ctx->delta_stats.num_rd++;
+ result = 0;
+ }
+ break;
+
+ case CEPH_OSD_OP_UNDIRTY:
+ ++ctx->num_write;
+ {
+ ctx->undirty = true; // see make_writeable()
+ ctx->modify = true;
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
case CEPH_OSD_OP_GETXATTR:
++ctx->num_read;
{
string aname;
bp.copy(op.xattr.name_len, aname);
string name = "_" + aname;
- int r = osd->store->getattr(coll, soid, name.c_str(), osd_op.outdata);
+ int r = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &(osd_op.outdata));
if (r >= 0) {
op.xattr.value_len = r;
result = 0;
@@ -2457,9 +2713,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
bufferlist xattr;
if (op.op == CEPH_OSD_OP_CMPXATTR)
- result = osd->store->getattr(coll, soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &xattr);
else
- result = osd->store->getattr(coll, src_obc->obs.oi.soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ src_obc->obs.oi.soid,
+ name,
+ &xattr);
if (result < 0 && result != -EEXIST && result != -ENODATA)
break;
@@ -2523,8 +2785,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ERANGE;
else if (ver > oi.user_version)
result = -EOVERFLOW;
- break;
}
+ break;
case CEPH_OSD_OP_LIST_WATCHERS:
++ctx->num_read;
@@ -2707,6 +2969,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
<< ", adjusting write length to " << op.extent.length << dendl;
+ bufferlist t;
+ t.substr_of(osd_op.indata, 0, op.extent.length);
+ osd_op.indata.swap(t);
}
if (op.extent.truncate_seq > seq) {
// write arrives before trimtrunc
@@ -2829,7 +3094,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
break;
-
+
case CEPH_OSD_OP_TRIMTRUNC:
op.extent.offset = op.extent.truncate_size;
// falling through
@@ -2925,10 +3190,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
<< " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
dout(10) << "watch: peer_addr="
- << ctx->op->request->get_connection()->get_peer_addr() << dendl;
+ << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
watch_info_t w(cookie, cct->_conf->osd_client_watch_timeout,
- ctx->op->request->get_connection()->get_peer_addr());
+ ctx->op->get_req()->get_connection()->get_peer_addr());
if (do_watch) {
if (oi.watchers.count(make_pair(cookie, entity))) {
dout(10) << " found existing watch " << w << " by " << entity << dendl;
@@ -2960,7 +3225,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_SETXATTR:
++ctx->num_write;
{
- if (op.xattr.value_len > cct->_conf->osd_max_attr_size) {
+ if (cct->_conf->osd_max_attr_size > 0 &&
+ op.xattr.value_len > cct->_conf->osd_max_attr_size) {
result = -EFBIG;
break;
}
@@ -3058,11 +3324,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
}
- if (cct->_conf->osd_tmapput_sets_uses_tmap) {
- assert(cct->_conf->osd_auto_upgrade_tmap);
- oi.uses_tmap = true;
- }
-
// write it
vector<OSDOp> nops(1);
OSDOp& newop = nops[0];
@@ -3108,29 +3369,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
set<string> out_set;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETKEYS: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- map<string, bufferlist>::iterator iter =
- vals.upper_bound(start_after);
- for (uint64_t i = 0;
- i < max_return && iter != vals.end();
- ++i, iter++) {
- out_set.insert(iter->first);
- }
- ::encode(out_set, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- dout(10) << "failed, reading from omap" << dendl;
- // No valid tmap, use omap
- }
-
{
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, soid
@@ -3166,30 +3404,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
map<string, bufferlist> out_set;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETVALS: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- map<string, bufferlist>::iterator iter = vals.upper_bound(start_after);
- if (filter_prefix > start_after) iter = vals.lower_bound(filter_prefix);
- for (uint64_t i = 0;
- i < max_return && iter != vals.end() &&
- iter->first.substr(0, filter_prefix.size()) == filter_prefix;
- ++i, iter++) {
- out_set.insert(*iter);
- }
- ::encode(out_set, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- // No valid tmap, use omap
- dout(10) << "failed, reading from omap" << dendl;
- }
-
{
ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
coll, soid
@@ -3217,19 +3431,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPGETHEADER:
++ctx->num_read;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGETHEADER: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- osd_op.outdata.claim(header);
- break;
- }
- // No valid tmap, fall through to omap
- dout(10) << "failed, reading from omap" << dendl;
- }
osd->store->omap_get_header(coll, soid, &osd_op.outdata);
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
ctx->delta_stats.num_rd++;
@@ -3248,28 +3449,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
goto fail;
}
map<string, bufferlist> out;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- dout(20) << "CEPH_OSD_OP_OMAPGET: "
- << " Reading " << oi.soid << " omap from tmap" << dendl;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r == 0) {
- for (set<string>::iterator iter = keys_to_get.begin();
- iter != keys_to_get.end();
- ++iter) {
- if (vals.count(*iter)) {
- out.insert(*(vals.find(*iter)));
- }
- }
- ::encode(out, osd_op.outdata);
- ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
- ctx->delta_stats.num_rd++;
- break;
- }
- // No valid tmap, use omap
- dout(10) << "failed, reading from omap" << dendl;
- }
osd->store->omap_get_values(coll, soid, keys_to_get, &out);
::encode(out, osd_op.outdata);
ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
@@ -3347,9 +3526,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPSETVALS:
++ctx->num_write;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
@@ -3377,9 +3553,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPSETHEADER:
++ctx->num_write;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
@@ -3397,9 +3570,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
t.touch(coll, soid);
t.omap_clear(coll, soid);
ctx->delta_stats.num_wr++;
@@ -3413,9 +3583,6 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
- _copy_up_tmap(ctx);
- }
t.touch(coll, soid);
set<string> to_rm;
try {
@@ -3525,7 +3692,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
goto fail;
}
- if (!ctx->copy_op) {
+ if (!ctx->copy_cb) {
// start
pg_t raw_pg;
get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
@@ -3537,13 +3704,18 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
break;
}
- result = start_copy(ctx, src, src_oloc, src_version, &ctx->copy_op);
+ hobject_t temp_target = generate_temp_object();
+ CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target);
+ ctx->copy_cb = cb;
+ result = start_copy(cb, ctx->obc, src, src_oloc, src_version,
+ temp_target);
if (result < 0)
goto fail;
result = -EINPROGRESS;
} else {
// finish
- result = finish_copy(ctx);
+ assert(ctx->copy_cb->get_result() >= 0);
+ result = finish_copyfrom(ctx);
}
}
break;
@@ -3589,22 +3761,6 @@ int ReplicatedPG::_get_tmap(OpContext *ctx,
return 0;
}
-int ReplicatedPG::_copy_up_tmap(OpContext *ctx)
-{
- dout(20) << "copying up tmap for " << ctx->new_obs.oi.soid << dendl;
- ctx->new_obs.oi.uses_tmap = false;
- map<string, bufferlist> vals;
- bufferlist header;
- int r = _get_tmap(ctx, &vals, &header);
- if (r < 0)
- return 0;
- ctx->op_t.omap_setkeys(coll, ctx->new_obs.oi.soid,
- vals);
- ctx->op_t.omap_setheader(coll, ctx->new_obs.oi.soid,
- header);
- return 0;
-}
-
inline int ReplicatedPG::_delete_head(OpContext *ctx)
{
SnapSet& snapset = ctx->new_snapset;
@@ -3651,37 +3807,35 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
int ret = find_object_context(
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
- if (ret) {
- if (-ENOENT == ret) {
- // there's no snapshot here, or there's no object.
- // if there's no snapshot, we delete the object; otherwise, do nothing.
- dout(20) << "_rollback_to deleting head on " << soid.oid
- << " because got ENOENT on find_object_context" << dendl;
- if (ctx->obc->obs.oi.watchers.size()) {
- // Cannot delete an object with watchers
- ret = -EBUSY;
- } else {
- _delete_head(ctx);
- ret = 0;
- }
- } else if (-EAGAIN == ret) {
- /* a different problem, like degraded pool
- * with not-yet-restored object. We shouldn't have been able
- * to get here; recovery should have completed first! */
- hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
- info.pgid.pool(), soid.get_namespace());
- assert(is_missing_object(rollback_target));
- dout(20) << "_rollback_to attempted to roll back to a missing object "
- << rollback_target << " (requested snapid: ) " << snapid << dendl;
- wait_for_missing_object(rollback_target, ctx->op);
+ if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
+ // there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object; otherwise, do nothing.
+ dout(20) << "_rollback_to deleting head on " << soid.oid
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
+ if (ctx->obc->obs.oi.watchers.size()) {
+ // Cannot delete an object with watchers
+ ret = -EBUSY;
} else {
- // ummm....huh? It *can't* return anything else at time of writing.
- assert(0);
- }
+ _delete_head(ctx);
+ ret = 0;
+ }
+ } else if (-EAGAIN == ret) {
+ /* a different problem, like degraded pool
+ * with not-yet-restored object. We shouldn't have been able
+ * to get here; recovery should have completed first! */
+ hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
+ info.pgid.pool(), soid.get_namespace());
+ assert(is_missing_object(rollback_target));
+ dout(20) << "_rollback_to attempted to roll back to a missing object "
+ << rollback_target << " (requested snapid: ) " << snapid << dendl;
+ wait_for_missing_object(rollback_target, ctx->op);
+ } else if (ret) {
+ // ummm....huh? It *can't* return anything else at time of writing.
+ assert(0 == "unexpected error code in _rollback_to");
} else { //we got our context, let's use it to do the rollback!
hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
if (is_degraded_object(rollback_to_sobject)) {
- dout(20) << "_rollback_to attempted to roll back to a degraded object "
+ dout(20) << "_rollback_to attempted to roll back to a degraded object "
<< rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
wait_for_degraded_object(rollback_to_sobject, ctx->op);
ret = -EAGAIN;
@@ -3757,6 +3911,15 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
dout(20) << "make_writeable " << soid << " snapset=" << ctx->snapset
<< " snapc=" << snapc << dendl;;
+ // we will mark the object dirty
+ if (ctx->undirty) {
+ dout(20) << " clearing DIRTY flag" << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ } else {
+ dout(20) << " setting DIRTY flag" << dendl;
+ ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
+ }
+
// use newer snapc?
if (ctx->new_snapset.seq > snapc.seq) {
snapc.seq = ctx->new_snapset.seq;
@@ -3875,7 +4038,7 @@ void ReplicatedPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum
void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
{
- ConnectionRef conn(ctx->op->request->get_connection());
+ ConnectionRef conn(ctx->op->get_req()->get_connection());
boost::intrusive_ptr<OSD::Session> session(
(OSD::Session *)conn->get_priv());
session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
@@ -3967,19 +4130,9 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
}
}
-bool ReplicatedPG::have_temp_coll()
-{
- return temp_created || osd->store->collection_exists(temp_coll);
-}
-
coll_t ReplicatedPG::get_temp_coll(ObjectStore::Transaction *t)
{
- if (temp_created)
- return temp_coll;
- if (!osd->store->collection_exists(temp_coll))
- t->create_collection(temp_coll);
- temp_created = true;
- return temp_coll;
+ return pgbackend->get_temp_coll(t);
}
hobject_t ReplicatedPG::generate_temp_object()
@@ -3987,6 +4140,7 @@ hobject_t ReplicatedPG::generate_temp_object()
ostringstream ss;
ss << "temp_" << info.pgid << "_" << get_role() << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
hobject_t hoid(object_t(ss.str()), "", CEPH_NOSNAP, 0, -1, "");
+ pgbackend->add_temp_obj(hoid);
dout(20) << __func__ << " " << hoid << dendl;
return hoid;
}
@@ -4158,12 +4312,12 @@ struct C_Copyfrom : public Context {
}
};
-int ReplicatedPG::start_copy(OpContext *ctx,
+int ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
hobject_t src, object_locator_t oloc, version_t version,
- CopyOpRef *pcop)
+ const hobject_t& temp_dest_oid)
{
- const hobject_t& dest = ctx->obs->oi.soid;
- dout(10) << __func__ << " " << dest << " ctx " << ctx
+ const hobject_t& dest = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << dest
<< " from " << src << " " << oloc << " v" << version
<< dendl;
@@ -4175,19 +4329,18 @@ int ReplicatedPG::start_copy(OpContext *ctx,
cancel_copy(cop);
}
- CopyOpRef cop(new CopyOp(ctx, src, oloc, version));
+ CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid));
copy_ops[dest] = cop;
- ctx->copy_op = cop;
- ++ctx->obc->copyfrom_readside;
+ ++obc->copyfrom_readside;
- _copy_some(ctx, cop);
+ _copy_some(obc, cop);
return 0;
}
-void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop)
+void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
{
- dout(10) << __func__ << " " << ctx << " " << cop << dendl;
+ dout(10) << __func__ << " " << obc << " " << cop << dendl;
ObjectOperation op;
if (cop->version) {
op.assert_version(cop->version);
@@ -4201,7 +4354,7 @@ void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop)
&cop->data, &cop->omap,
&cop->rval);
- C_Copyfrom *fin = new C_Copyfrom(this, ctx->obs->oi.soid,
+ C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
get_last_peering_reset());
osd->objecter_lock.Lock();
tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
@@ -4229,50 +4382,48 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
<< " tid " << cop->objecter_tid << dendl;
return;
}
- OpContext *ctx = cop->ctx;
+ ObjectContextRef obc = cop->obc;
cop->objecter_tid = 0;
- if (r < 0) {
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- kick_object_context_blocked(ctx->obc);
- reply_ctx(ctx, r);
- return;
- }
- assert(cop->rval >= 0);
-
- if (!cop->cursor.is_complete()) {
- // write out what we have so far
- vector<OSDOp> ops;
- tid_t rep_tid = osd->get_tid();
- osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
- OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &ctx->obc->obs, ctx->obc->ssc, this);
- tctx->mtime = ceph_clock_now(g_ceph_context);
- RepGather *repop = new_repop(tctx, ctx->obc, rep_tid);
- if (cop->temp_cursor.is_initial()) {
- cop->temp_coll = get_temp_coll(&tctx->local_t);
- cop->temp_oid = generate_temp_object();
- temp_contents.insert(cop->temp_oid);
- repop->ctx->new_temp_oid = cop->temp_oid;
- }
+ CopyResults results;
+ if (r >= 0) {
+ assert(cop->rval >= 0);
+
+ if (!cop->cursor.is_complete()) {
+ // write out what we have so far
+ vector<OSDOp> ops;
+ tid_t rep_tid = osd->get_tid();
+ osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
+ OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this);
+ tctx->mtime = ceph_clock_now(g_ceph_context);
+ RepGather *repop = new_repop(tctx, obc, rep_tid);
+
+ if (cop->temp_cursor.is_initial()) {
+ cop->temp_coll = get_temp_coll(&tctx->local_t);
+ repop->ctx->new_temp_oid = cop->temp_oid;
+ }
- _write_copy_chunk(cop, &tctx->op_t);
+ _write_copy_chunk(cop, &tctx->op_t);
- issue_repop(repop, repop->ctx->mtime);
- eval_repop(repop);
+ issue_repop(repop, repop->ctx->mtime);
+ eval_repop(repop);
+ repop->put();
- dout(10) << __func__ << " fetching more" << dendl;
- _copy_some(ctx, cop);
- return;
+ dout(10) << __func__ << " fetching more" << dendl;
+ _copy_some(obc, cop);
+ return;
+ }
+ _build_finish_copy_transaction(cop, results.get<3>());
+ results.get<1>() = cop->temp_cursor.data_offset;
}
dout(20) << __func__ << " complete; committing" << dendl;
- execute_ctx(ctx);
+ results.get<0>() = r;
+ cop->cb->complete(results);
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- ctx->copy_op.reset();
- kick_object_context_blocked(ctx->obc);
+ copy_ops.erase(obc->obs.oi.soid);
+ --obc->copyfrom_readside;
+ kick_object_context_blocked(obc);
}
void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
@@ -4299,16 +4450,12 @@ void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
cop->temp_cursor = cop->cursor;
}
-int ReplicatedPG::finish_copy(OpContext *ctx)
+void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t)
{
- CopyOpRef cop = ctx->copy_op;
- ObjectState& obs = ctx->new_obs;
- ObjectStore::Transaction& t = ctx->op_t;
+ ObjectState& obs = cop->obc->obs;
- if (!obs.exists) {
- ctx->delta_stats.num_objects++;
- obs.exists = true;
- } else {
+ if (obs.exists) {
t.remove(coll, obs.oi.soid);
}
@@ -4321,19 +4468,35 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
// finish writing to temp object, then move into place
_write_copy_chunk(cop, &t);
t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, obs.oi.soid);
- temp_contents.erase(cop->temp_oid);
- ctx->discard_temp_oid = cop->temp_oid;
+ pgbackend->clear_temp_obj(cop->temp_oid);
+ }
+}
+
+int ReplicatedPG::finish_copyfrom(OpContext *ctx)
+{
+ dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
+ ObjectState& obs = ctx->new_obs;
+ CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
+
+ if (!ctx->obs->exists) {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
}
+ if (cb->is_temp_obj_used()) {
+ ctx->discard_temp_oid = cb->temp_obj;
+ }
+ ctx->op_t.swap(cb->results.get<3>());
+ ctx->op_t.append(cb->results.get<3>());
interval_set<uint64_t> ch;
if (obs.oi.size > 0)
ch.insert(0, obs.oi.size);
ctx->modified_ranges.union_of(ch);
- if (cop->cursor.data_offset != obs.oi.size) {
+ if (cb->get_data_size() != obs.oi.size) {
ctx->delta_stats.num_bytes -= obs.oi.size;
+ obs.oi.size = cb->get_data_size();
ctx->delta_stats.num_bytes += obs.oi.size;
- obs.oi.size = cop->cursor.data_offset;
}
ctx->delta_stats.num_wr++;
ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
@@ -4343,8 +4506,7 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
void ReplicatedPG::cancel_copy(CopyOpRef cop)
{
- OpContext *ctx = cop->ctx;
- dout(10) << __func__ << " " << ctx->obc->obs.oi.soid << " ctx " << ctx
+ dout(10) << __func__ << " " << cop->obc->obs.oi.soid
<< " from " << cop->src << " " << cop->oloc << " v" << cop->version
<< dendl;
@@ -4354,28 +4516,21 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop)
osd->objecter->op_cancel(cop->objecter_tid);
}
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- ctx->copy_op.reset();
-
- kick_object_context_blocked(ctx->obc);
+ copy_ops.erase(cop->obc->obs.oi.soid);
+ --cop->obc->copyfrom_readside;
- delete ctx;
+ kick_object_context_blocked(cop->obc);
+ bool temp_obj_created = !cop->cursor.is_initial();
+ CopyResults result(-ECANCELED, 0, temp_obj_created, ObjectStore::Transaction());
+ cop->cb->complete(result);
}
-void ReplicatedPG::requeue_cancel_copy_ops(bool requeue)
+void ReplicatedPG::cancel_copy_ops()
{
dout(10) << __func__ << dendl;
- for (map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
- p != copy_ops.end();
- copy_ops.erase(p++)) {
- // requeue initiating copy *and* any subsequent waiters
- CopyOpRef cop = p->second;
- if (requeue) {
- cop->waiting.push_front(cop->ctx->op);
- requeue_ops(cop->waiting);
- }
- cancel_copy(cop);
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
+ while (p != copy_ops.end()) {
+ cancel_copy((p++)->second);
}
}
@@ -4426,10 +4581,19 @@ void ReplicatedPG::apply_repop(RepGather *repop)
if (repop->ctx->clone_obc)
repop->ctx->clone_obc->ondisk_write_lock();
+ bool unlock_snapset_obc = false;
+ if (repop->ctx->snapset_obc && repop->ctx->snapset_obc->obs.oi.soid !=
+ repop->obc->obs.oi.soid) {
+ repop->ctx->snapset_obc->ondisk_write_lock();
+ unlock_snapset_obc = true;
+ }
+
Context *oncommit = new C_OSD_OpCommit(this, repop);
Context *onapplied = new C_OSD_OpApplied(this, repop);
- Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(repop->obc,
- repop->ctx->clone_obc);
+ Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
+ repop->obc,
+ repop->ctx->clone_obc,
+ unlock_snapset_obc ? repop->ctx->snapset_obc : ObjectContextRef());
int r = osd->store->queue_transactions(osr.get(), repop->tls, onapplied, oncommit, onapplied_sync, repop->ctx->op);
if (r) {
derr << "apply_repop queue_transactions returned " << r << " on " << *repop << dendl;
@@ -4533,7 +4697,7 @@ void ReplicatedPG::eval_repop(RepGather *repop)
{
MOSDOp *m = NULL;
if (repop->ctx->op)
- m = static_cast<MOSDOp *>(repop->ctx->op->request);
+ m = static_cast<MOSDOp *>(repop->ctx->op->get_req());
if (m)
dout(10) << "eval_repop " << *repop
@@ -4609,7 +4773,7 @@ void ReplicatedPG::eval_repop(RepGather *repop)
for (list<OpRequestRef>::iterator i = waiting_for_ack[repop->v].begin();
i != waiting_for_ack[repop->v].end();
++i) {
- MOSDOp *m = (MOSDOp*)(*i)->request;
+ MOSDOp *m = (MOSDOp*)(*i)->get_req();
MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0);
reply->set_reply_versions(repop->ctx->at_version,
repop->ctx->user_at_version);
@@ -4705,7 +4869,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
get_osdmap()->get_epoch(),
repop->rep_tid, repop->ctx->at_version);
if (ctx->op &&
- ((static_cast<MOSDOp *>(ctx->op->request))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) {
+ ((static_cast<MOSDOp *>(ctx->op->get_req()))->get_flags() & CEPH_OSD_FLAG_PARALLELEXEC)) {
// replicate original op for parallel execution on replica
assert(0 == "broken implementation, do not use");
}
@@ -4746,7 +4910,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe
tid_t rep_tid)
{
if (ctx->op)
- dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->request << dendl;
+ dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
else
dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
@@ -4777,7 +4941,7 @@ void ReplicatedPG::repop_ack(RepGather *repop, int result, int ack_type,
MOSDOp *m = NULL;
if (repop->ctx->op)
- m = static_cast<MOSDOp *>(repop->ctx->op->request);
+ m = static_cast<MOSDOp *>(repop->ctx->op->get_req());
if (m)
dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *m
@@ -4886,7 +5050,8 @@ void ReplicatedPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
void ReplicatedPG::populate_obc_watchers(ObjectContextRef obc)
{
assert(is_active());
- assert(!is_missing_object(obc->obs.oi.soid) ||
+ assert((recovering.count(obc->obs.oi.soid) ||
+ !is_missing_object(obc->obs.oi.soid)) ||
(pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
pg_log_entry_t::LOST_REVERT &&
@@ -4979,6 +5144,7 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
// obc ref swallowed by repop!
issue_repop(repop, repop->ctx->mtime);
eval_repop(repop);
+ repop->put();
}
ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
@@ -4998,23 +5164,37 @@ ObjectContextRef ReplicatedPG::create_object_context(const object_info_t& oi,
}
ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
- bool can_create)
-{
+ bool can_create,
+ map<string, bufferptr> *attrs)
+{
+ assert(
+ attrs || !pg_log.get_missing().is_missing(soid) ||
+ // or this is a revert... see recover_primary()
+ (pg_log.get_log().objects.count(soid) &&
+ pg_log.get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT));
ObjectContextRef obc = object_contexts.lookup(soid);
if (obc) {
dout(10) << "get_object_context " << obc << " " << soid << dendl;
} else {
// check disk
bufferlist bv;
- int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
- if (r < 0) {
- if (!can_create)
- return ObjectContextRef(); // -ENOENT!
-
- // new object.
- object_info_t oi(soid);
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- return create_object_context(oi, ssc);
+ if (attrs) {
+ assert(attrs->count(OI_ATTR));
+ bv.push_back(attrs->find(OI_ATTR)->second);
+ } else {
+ int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
+ if (r < 0) {
+ if (!can_create)
+ return ObjectContextRef(); // -ENOENT!
+
+ // new object.
+ object_info_t oi(soid);
+ SnapSetContext *ssc = get_snapset_context(
+ soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace(),
+ soid.has_snapset() ? attrs : 0);
+ return create_object_context(oi, ssc);
+ }
}
object_info_t oi(bv);
@@ -5026,10 +5206,11 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
obc->obs.oi = oi;
obc->obs.exists = true;
- if (can_create) {
- obc->ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, true, soid.get_namespace());
- register_snapset_context(obc->ssc);
- }
+ obc->ssc = get_snapset_context(
+ soid.oid, soid.get_key(), soid.hash,
+ true, soid.get_namespace(),
+ soid.has_snapset() ? attrs : 0);
+ register_snapset_context(obc->ssc);
populate_obc_watchers(obc);
dout(10) << "get_object_context " << obc << " " << soid << " 0 -> 1 read " << obc->obs.oi << dendl;
@@ -5233,10 +5414,10 @@ void ReplicatedPG::kick_object_context_blocked(ObjectContextRef obc)
return;
}
- list<OpRequestRef>& ls = waiting_for_blocked_object[soid];
+ list<OpRequestRef>& ls = p->second;
dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
requeue_ops(ls);
- waiting_for_blocked_object.erase(soid);
+ waiting_for_blocked_object.erase(p);
}
SnapSetContext *ReplicatedPG::create_snapset_context(const object_t& oid)
@@ -5248,11 +5429,13 @@ SnapSetContext *ReplicatedPG::create_snapset_context(const object_t& oid)
return ssc;
}
-SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
- const string& key,
- ps_t seed,
- bool can_create,
- const string& nspace)
+SnapSetContext *ReplicatedPG::get_snapset_context(
+ const object_t& oid,
+ const string& key,
+ ps_t seed,
+ bool can_create,
+ const string& nspace,
+ map<string, bufferptr> *attrs)
{
Mutex::Locker l(snapset_contexts_lock);
SnapSetContext *ssc;
@@ -5261,20 +5444,25 @@ SnapSetContext *ReplicatedPG::get_snapset_context(const object_t& oid,
ssc = p->second;
} else {
bufferlist bv;
- hobject_t head(oid, key, CEPH_NOSNAP, seed,
- info.pgid.pool(), nspace);
- int r = osd->store->getattr(coll, head, SS_ATTR, bv);
- if (r < 0) {
- // try _snapset
- hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
- info.pgid.pool(), nspace);
- r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
- if (r < 0 && !can_create)
- return NULL;
+ if (!attrs) {
+ hobject_t head(oid, key, CEPH_NOSNAP, seed,
+ info.pgid.pool(), nspace);
+ int r = pgbackend->objects_get_attr(head, SS_ATTR, &bv);
+ if (r < 0) {
+ // try _snapset
+ hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
+ info.pgid.pool(), nspace);
+ r = pgbackend->objects_get_attr(snapdir, SS_ATTR, &bv);
+ if (r < 0 && !can_create)
+ return NULL;
+ }
+ } else {
+ assert(attrs->count(SS_ATTR));
+ bv.push_back(attrs->find(SS_ATTR)->second);
}
ssc = new SnapSetContext(oid);
_register_snapset_context(ssc);
- if (r >= 0) {
+ if (bv.length()) {
bufferlist::iterator bvp = bv.begin();
ssc->snapset.decode(bvp);
}
@@ -5299,7 +5487,7 @@ void ReplicatedPG::put_snapset_context(SnapSetContext *ssc)
void ReplicatedPG::sub_op_modify(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
const hobject_t& soid = m->poid;
@@ -5350,12 +5538,12 @@ void ReplicatedPG::sub_op_modify(OpRequestRef op)
if (m->new_temp_oid != hobject_t()) {
dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl;
- temp_contents.insert(m->new_temp_oid);
+ pgbackend->add_temp_obj(m->new_temp_oid);
get_temp_coll(&rm->localt);
}
if (m->discard_temp_oid != hobject_t()) {
dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl;
- temp_contents.erase(m->discard_temp_oid);
+ pgbackend->clear_temp_obj(m->discard_temp_oid);
}
::decode(rm->opt, p);
@@ -5418,8 +5606,8 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
rm->applied = true;
if (!pg_has_reset_since(rm->epoch_started)) {
- dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request << dendl;
- MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->request);
+ dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req() << dendl;
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(rm->op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
if (!rm->committed) {
@@ -5441,7 +5629,7 @@ void ReplicatedPG::sub_op_modify_applied(RepModify *rm)
}
}
} else {
- dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->request
+ dout(10) << "sub_op_modify_applied on " << rm << " op " << *rm->op->get_req()
<< " from epoch " << rm->epoch_started << " < last_peering_reset "
<< last_peering_reset << dendl;
}
@@ -5463,24 +5651,24 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
if (!pg_has_reset_since(rm->epoch_started)) {
// send commit.
- dout(10) << "sub_op_modify_commit on op " << *rm->op->request
+ dout(10) << "sub_op_modify_commit on op " << *rm->op->get_req()
<< ", sending commit to osd." << rm->ackerosd
<< dendl;
if (get_osdmap()->is_up(rm->ackerosd)) {
last_complete_ondisk = rm->last_complete;
- MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->request), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
+ MOSDSubOpReply *commit = new MOSDSubOpReply(static_cast<MOSDSubOp*>(rm->op->get_req()), 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
commit->set_last_complete_ondisk(rm->last_complete);
commit->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
osd->send_message_osd_cluster(rm->ackerosd, commit, get_osdmap()->get_epoch());
}
} else {
- dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->request
+ dout(10) << "sub_op_modify_commit " << rm << " op " << *rm->op->get_req()
<< " from epoch " << rm->epoch_started << " < last_peering_reset "
<< last_peering_reset << dendl;
}
- log_subop_stats(rm->op, l_osd_sop_w_inb, l_osd_sop_w_lat);
+ log_subop_stats(osd, rm->op, l_osd_sop_w_inb, l_osd_sop_w_lat);
bool done = rm->applied && rm->committed;
unlock();
if (done) {
@@ -5492,7 +5680,7 @@ void ReplicatedPG::sub_op_modify_commit(RepModify *rm)
void ReplicatedPG::sub_op_modify_reply(OpRequestRef op)
{
- MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->request);
+ MOSDSubOpReply *r = static_cast<MOSDSubOpReply*>(op->get_req());
assert(r->get_header().type == MSG_OSD_SUBOPREPLY);
op->mark_started();
@@ -5521,11 +5709,12 @@ void ReplicatedPG::sub_op_modify_reply(OpRequestRef op)
// ===========================================================
-void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
- pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+void ReplicatedBackend::calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets)
{
dout(10) << "calc_head_subsets " << head
<< " clone_overlap " << snapset.clone_overlap << dendl;
@@ -5575,11 +5764,12 @@ void ReplicatedPG::calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, con
<< " clone_subsets " << clone_subsets << dendl;
}
-void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
- const pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets)
+void ReplicatedBackend::calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& soid,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t> >& clone_subsets)
{
dout(10) << "calc_clone_subsets " << soid
<< " clone_overlap " << snapset.clone_overlap << dendl;
@@ -5664,95 +5854,69 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, const hobject_t& soid,
*/
enum { PULL_NONE, PULL_OTHER, PULL_YES };
-int ReplicatedPG::prepare_pull(
- const hobject_t& soid, eversion_t v,
- int priority,
- map<int, vector<PullOp> > *pulls)
-{
+void ReplicatedBackend::prepare_pull(
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h)
+{
+ assert(get_parent()->get_local_missing().missing.count(soid));
+ eversion_t v = get_parent()->get_local_missing().missing.find(
+ soid)->second.need;
+ const map<hobject_t, set<int> > &missing_loc(
+ get_parent()->get_missing_loc());
+ const map<int, pg_missing_t > &peer_missing(
+ get_parent()->get_peer_missing());
int fromosd = -1;
- map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
- if (q != missing_loc.end()) {
- // randomize the list of possible sources
- // should we take weights into account?
- vector<int> shuffle(q->second.begin(), q->second.end());
- random_shuffle(shuffle.begin(), shuffle.end());
- for (vector<int>::iterator p = shuffle.begin();
- p != shuffle.end();
- ++p) {
- if (get_osdmap()->is_up(*p)) {
- fromosd = *p;
- break;
- }
- }
- }
- if (fromosd < 0) {
- dout(7) << "pull " << soid
- << " v " << v
- << " but it is unfound" << dendl;
- return PULL_NONE;
- }
+ map<hobject_t,set<int> >::const_iterator q = missing_loc.find(soid);
+ assert(q != missing_loc.end());
+ assert(!q->second.empty());
+
+ // pick a pullee
+ vector<int> shuffle(q->second.begin(), q->second.end());
+ random_shuffle(shuffle.begin(), shuffle.end());
+ vector<int>::iterator p = shuffle.begin();
+ assert(get_osdmap()->is_up(*p));
+ fromosd = *p;
+ assert(fromosd >= 0);
+
+ dout(7) << "pull " << soid
+ << "v " << v
+ << " on osds " << *p
+ << " from osd." << fromosd
+ << dendl;
assert(peer_missing.count(fromosd));
- if (peer_missing[fromosd].is_missing(soid, v)) {
- assert(peer_missing[fromosd].missing[soid].have != v);
+ const pg_missing_t &pmissing = peer_missing.find(fromosd)->second;
+ if (pmissing.is_missing(soid, v)) {
+ assert(pmissing.missing.find(soid)->second.have != v);
dout(10) << "pulling soid " << soid << " from osd " << fromosd
- << " at version " << peer_missing[fromosd].missing[soid].have
+ << " at version " << pmissing.missing.find(soid)->second.have
<< " rather than at version " << v << dendl;
- v = peer_missing[fromosd].missing[soid].have;
- assert(pg_log.get_log().objects.count(soid) &&
- pg_log.get_log().objects.find(soid)->second->op == pg_log_entry_t::LOST_REVERT &&
- pg_log.get_log().objects.find(soid)->second->reverting_to == v);
+ v = pmissing.missing.find(soid)->second.have;
+ assert(get_parent()->get_log().get_log().objects.count(soid) &&
+ (get_parent()->get_log().get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT) &&
+ (get_parent()->get_log().get_log().objects.find(
+ soid)->second->reverting_to ==
+ v));
}
- dout(7) << "pull " << soid
- << " v " << v
- << " on osds " << missing_loc[soid]
- << " from osd." << fromosd
- << dendl;
-
ObjectRecoveryInfo recovery_info;
- // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
- if (soid.snap && soid.snap < CEPH_NOSNAP) {
- // do we have the head and/or snapdir?
- hobject_t head = soid;
- head.snap = CEPH_NOSNAP;
- if (pg_log.get_missing().is_missing(head)) {
- if (pulling.count(head)) {
- dout(10) << " missing but already pulling head " << head << dendl;
- return PULL_NONE;
- } else {
- int r = prepare_pull(
- head, pg_log.get_missing().missing.find(head)->second.need, priority,
- pulls);
- if (r != PULL_NONE)
- return PULL_OTHER;
- return PULL_NONE;
- }
- }
- head.snap = CEPH_SNAPDIR;
- if (pg_log.get_missing().is_missing(head)) {
- if (pulling.count(head)) {
- dout(10) << " missing but already pulling snapdir " << head << dendl;
- return PULL_NONE;
- } else {
- int r = prepare_pull(
- head, pg_log.get_missing().missing.find(head)->second.need, priority,
- pulls);
- if (r != PULL_NONE)
- return PULL_OTHER;
- return PULL_NONE;
- }
- }
-
+ if (soid.is_snap()) {
+ assert(!get_parent()->get_local_missing().is_missing(
+ soid.get_head()) ||
+ !get_parent()->get_local_missing().is_missing(
+ soid.get_snapdir()));
+ assert(headctx);
// check snapset
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = headctx->ssc;
assert(ssc);
dout(10) << " snapset " << ssc->snapset << dendl;
- calc_clone_subsets(ssc->snapset, soid, pg_log.get_missing(), info.last_backfill,
+ calc_clone_subsets(ssc->snapset, soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
recovery_info.copy_subset,
recovery_info.clone_subset);
- put_snapset_context(ssc);
// FIXME: this may overestimate if we are pulling multiple clones in parallel...
dout(10) << " pulling " << recovery_info << dendl;
} else {
@@ -5762,8 +5926,8 @@ int ReplicatedPG::prepare_pull(
recovery_info.size = ((uint64_t)-1);
}
- (*pulls)[fromosd].push_back(PullOp());
- PullOp &op = (*pulls)[fromosd].back();
+ h->pulls[fromosd].push_back(PullOp());
+ PullOp &op = h->pulls[fromosd].back();
op.soid = soid;
op.recovery_info = recovery_info;
@@ -5777,11 +5941,78 @@ int ReplicatedPG::prepare_pull(
assert(!pulling.count(soid));
pull_from_peer[fromosd].insert(soid);
PullInfo &pi = pulling[soid];
+ pi.head_ctx = headctx;
pi.recovery_info = op.recovery_info;
pi.recovery_progress = op.recovery_progress;
- pi.priority = priority;
+}
+
+int ReplicatedPG::recover_missing(
+ const hobject_t &soid, eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h)
+{
+ map<hobject_t,set<int> >::iterator q = missing_loc.find(soid);
+ if (q == missing_loc.end()) {
+ dout(7) << "pull " << soid
+ << " v " << v
+ << " but it is unfound" << dendl;
+ return PULL_NONE;
+ }
+ // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
+ ObjectContextRef obc;
+ ObjectContextRef head_obc;
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head and/or snapdir?
+ hobject_t head = soid.get_head();
+ if (pg_log.get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, pg_log.get_missing().missing.find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_OTHER;
+ return PULL_NONE;
+ }
+ }
+ head = soid.get_snapdir();
+ if (pg_log.get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering snapdir " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, pg_log.get_missing().missing.find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_OTHER;
+ return PULL_NONE;
+ }
+ }
+
+ // we must have one or the other
+ head_obc = get_object_context(
+ soid.get_head(),
+ false,
+ 0);
+ if (!head_obc)
+ head_obc = get_object_context(
+ soid.get_snapdir(),
+ false,
+ 0);
+ assert(head_obc);
+ }
start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(soid);
+ pgbackend->recover_object(
+ soid,
+ head_obc,
+ obc,
+ h);
return PULL_YES;
}
@@ -5805,15 +6036,14 @@ void ReplicatedPG::send_remove_op(const hobject_t& oid, eversion_t v, int peer)
* intelligently push an object to a replica. make use of existing
* clones/heads and dup data ranges where possible.
*/
-void ReplicatedPG::prep_push_to_replica(
+void ReplicatedBackend::prep_push_to_replica(
ObjectContextRef obc, const hobject_t& soid, int peer,
- int prio,
PushOp *pop)
{
const object_info_t& oi = obc->obs.oi;
uint64_t size = obc->obs.oi.size;
- dout(10) << __func__ << soid << " v" << oi.version
+ dout(10) << __func__ << ": " << soid << " v" << oi.version
<< " size " << size << " to osd." << peer << dendl;
map<hobject_t, interval_set<uint64_t> > clone_subsets;
@@ -5826,41 +6056,48 @@ void ReplicatedPG::prep_push_to_replica(
// try to base push off of clones that succeed/preceed poid
// we need the head (and current SnapSet) locally to do that.
- if (pg_log.get_missing().is_missing(head)) {
+ if (get_parent()->get_local_missing().is_missing(head)) {
dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl;
- return prep_push(prio, obc, soid, peer, pop);
+ return prep_push(obc, soid, peer, pop);
}
hobject_t snapdir = head;
snapdir.snap = CEPH_SNAPDIR;
- if (pg_log.get_missing().is_missing(snapdir)) {
- dout(15) << "push_to_replica missing snapdir " << snapdir << ", pushing raw clone" << dendl;
- return prep_push(prio, obc, soid, peer, pop);
+ if (get_parent()->get_local_missing().is_missing(snapdir)) {
+ dout(15) << "push_to_replica missing snapdir " << snapdir
+ << ", pushing raw clone" << dendl;
+ return prep_push(obc, soid, peer, pop);
}
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- calc_clone_subsets(ssc->snapset, soid, peer_missing[peer],
- peer_info[peer].last_backfill,
+ map<int, pg_missing_t>::const_iterator pm =
+ get_parent()->get_peer_missing().find(peer);
+ assert(pm != get_parent()->get_peer_missing().end());
+ map<int, pg_info_t>::const_iterator pi =
+ get_parent()->get_peer_info().find(peer);
+ assert(pi != get_parent()->get_peer_info().end());
+ calc_clone_subsets(ssc->snapset, soid,
+ pm->second,
+ pi->second.last_backfill,
data_subset, clone_subsets);
- put_snapset_context(ssc);
} else if (soid.snap == CEPH_NOSNAP) {
// pushing head or unversioned object.
// base this on partially on replica's clones?
- SnapSetContext *ssc = get_snapset_context(soid.oid, soid.get_key(), soid.hash, false, soid.get_namespace());
+ SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
- calc_head_subsets(obc, ssc->snapset, soid, peer_missing[peer],
- peer_info[peer].last_backfill,
- data_subset, clone_subsets);
- put_snapset_context(ssc);
+ calc_head_subsets(
+ obc,
+ ssc->snapset, soid, get_parent()->get_peer_missing().find(peer)->second,
+ get_parent()->get_peer_info().find(peer)->second.last_backfill,
+ data_subset, clone_subsets);
}
- prep_push(prio, obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
+ prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
}
-void ReplicatedPG::prep_push(int prio,
- ObjectContextRef obc,
+void ReplicatedBackend::prep_push(ObjectContextRef obc,
const hobject_t& soid, int peer,
PushOp *pop)
{
@@ -5869,13 +6106,12 @@ void ReplicatedPG::prep_push(int prio,
data_subset.insert(0, obc->obs.oi.size);
map<hobject_t, interval_set<uint64_t> > clone_subsets;
- prep_push(prio, obc, soid, peer,
+ prep_push(obc, soid, peer,
obc->obs.oi.version, data_subset, clone_subsets,
pop);
}
-void ReplicatedPG::prep_push(
- int prio,
+void ReplicatedBackend::prep_push(
ObjectContextRef obc,
const hobject_t& soid, int peer,
eversion_t version,
@@ -5883,9 +6119,10 @@ void ReplicatedPG::prep_push(
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
PushOp *pop)
{
- peer_missing[peer].revise_have(soid, eversion_t());
+ get_parent()->begin_peer_recover(peer, soid);
// take note.
PushInfo &pi = pushing[soid][peer];
+ pi.obc = obc;
pi.recovery_info.size = obc->obs.oi.size;
pi.recovery_info.copy_subset = data_subset;
pi.recovery_info.clone_subset = clone_subsets;
@@ -5896,19 +6133,20 @@ void ReplicatedPG::prep_push(
pi.recovery_progress.data_recovered_to = 0;
pi.recovery_progress.data_complete = 0;
pi.recovery_progress.omap_complete = 0;
- pi.priority = prio;
ObjectRecoveryProgress new_progress;
- build_push_op(pi.recovery_info,
- pi.recovery_progress,
- &new_progress,
- pop);
+ int r = build_push_op(pi.recovery_info,
+ pi.recovery_progress,
+ &new_progress,
+ pop,
+ &(pi.stat));
+ assert(r == 0);
pi.recovery_progress = new_progress;
}
-int ReplicatedPG::send_pull_legacy(int prio, int peer,
- const ObjectRecoveryInfo &recovery_info,
- ObjectRecoveryProgress progress)
+int ReplicatedBackend::send_pull_legacy(int prio, int peer,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectRecoveryProgress progress)
{
// send op
tid_t tid = osd->get_tid();
@@ -5921,7 +6159,7 @@ int ReplicatedPG::send_pull_legacy(int prio, int peer,
<< " from osd." << peer
<< " tid " << tid << dendl;
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, recovery_info.soid,
+ MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, recovery_info.soid,
false, CEPH_OSD_FLAG_ACK,
get_osdmap()->get_epoch(), tid,
recovery_info.version);
@@ -5938,7 +6176,7 @@ int ReplicatedPG::send_pull_legacy(int prio, int peer,
return 0;
}
-void ReplicatedPG::submit_push_data(
+void ReplicatedBackend::submit_push_data(
ObjectRecoveryInfo &recovery_info,
bool first,
bool complete,
@@ -5960,9 +6198,7 @@ void ReplicatedPG::submit_push_data(
}
if (first) {
- pg_log.revise_have(recovery_info.soid, eversion_t());
- remove_snap_mapped_object(*t, recovery_info.soid);
- t->remove(coll, recovery_info.soid);
+ get_parent()->on_local_recover_start(recovery_info.soid, t);
t->remove(get_temp_coll(t), recovery_info.soid);
t->touch(target_coll, recovery_info.soid);
t->omap_setheader(target_coll, recovery_info.soid, omap_header);
@@ -5996,8 +6232,8 @@ void ReplicatedPG::submit_push_data(
}
}
-void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info,
- ObjectStore::Transaction *t)
+void ReplicatedBackend::submit_push_complete(ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
{
for (map<hobject_t, interval_set<uint64_t> >::const_iterator p =
recovery_info.clone_subset.begin();
@@ -6012,67 +6248,29 @@ void ReplicatedPG::submit_push_complete(ObjectRecoveryInfo &recovery_info,
q.get_start(), q.get_len(), q.get_start());
}
}
-
- if (recovery_info.soid.snap < CEPH_NOSNAP) {
- assert(recovery_info.oi.snaps.size());
- OSDriver::OSTransaction _t(osdriver.get_transaction(t));
- set<snapid_t> snaps(
- recovery_info.oi.snaps.begin(),
- recovery_info.oi.snaps.end());
- snap_mapper.add_oid(
- recovery_info.soid,
- snaps,
- &_t);
- }
-
- if (pg_log.get_missing().is_missing(recovery_info.soid) &&
- pg_log.get_missing().missing.find(recovery_info.soid)->second.need > recovery_info.version) {
- assert(is_primary());
- const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
- if (latest->op == pg_log_entry_t::LOST_REVERT &&
- latest->reverting_to == recovery_info.version) {
- dout(10) << " got old revert version " << recovery_info.version
- << " for " << *latest << dendl;
- recovery_info.version = latest->version;
- // update the attr to the revert event version
- recovery_info.oi.prior_version = recovery_info.oi.version;
- recovery_info.oi.version = latest->version;
- bufferlist bl;
- ::encode(recovery_info.oi, bl);
- t->setattr(coll, recovery_info.soid, OI_ATTR, bl);
- }
- }
- recover_got(recovery_info.soid, recovery_info.version);
-
- // update pg
- dirty_info = true;
- write_if_dirty(*t);
}
-ObjectRecoveryInfo ReplicatedPG::recalc_subsets(const ObjectRecoveryInfo& recovery_info)
+ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc)
{
if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP)
return recovery_info;
-
- SnapSetContext *ssc = get_snapset_context(recovery_info.soid.oid,
- recovery_info.soid.get_key(),
- recovery_info.soid.hash,
- false,
- recovery_info.soid.get_namespace());
- assert(ssc);
ObjectRecoveryInfo new_info = recovery_info;
new_info.copy_subset.clear();
new_info.clone_subset.clear();
assert(ssc);
- calc_clone_subsets(ssc->snapset, new_info.soid, pg_log.get_missing(), info.last_backfill,
+ calc_clone_subsets(ssc->snapset, new_info.soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
new_info.copy_subset, new_info.clone_subset);
- put_snapset_context(ssc);
return new_info;
}
-bool ReplicatedPG::handle_pull_response(
+bool ReplicatedBackend::handle_pull_response(
int from, PushOp &pop, PullOp *response,
- ObjectStore::Transaction *t)
+ list<ObjectContextRef> *to_continue,
+ ObjectStore::Transaction *t
+ )
{
interval_set<uint64_t> data_included = pop.data_included;
bufferlist data;
@@ -6104,7 +6302,13 @@ bool ReplicatedPG::handle_pull_response(
pop.recovery_info.copy_subset);
}
- pi.recovery_info = recalc_subsets(pi.recovery_info);
+ bool first = pi.recovery_progress.first;
+ if (first) {
+ pi.obc = get_parent()->get_obc(pi.recovery_info.soid, pop.attrset);
+ pi.recovery_info.oi = pi.obc->obs.oi;
+ pi.recovery_info = recalc_subsets(pi.recovery_info, pi.obc->ssc);
+ }
+
interval_set<uint64_t> usable_intervals;
bufferlist usable_data;
@@ -6116,33 +6320,15 @@ bool ReplicatedPG::handle_pull_response(
data_included = usable_intervals;
data.claim(usable_data);
- info.stats.stats.sum.num_bytes_recovered += data.length();
- bool first = pi.recovery_progress.first;
pi.recovery_progress = pop.after_progress;
+ pi.stat.num_bytes_recovered += data.length();
+
dout(10) << "new recovery_info " << pi.recovery_info
<< ", new progress " << pi.recovery_progress
<< dendl;
- if (first) {
- bufferlist oibl;
- if (pop.attrset.count(OI_ATTR)) {
- oibl.push_back(pop.attrset[OI_ATTR]);
- ::decode(pi.recovery_info.oi, oibl);
- } else {
- assert(0);
- }
- bufferlist ssbl;
- if (pop.attrset.count(SS_ATTR)) {
- ssbl.push_back(pop.attrset[SS_ATTR]);
- ::decode(pi.recovery_info.ss, ssbl);
- } else {
- assert(pi.recovery_info.soid.snap != CEPH_NOSNAP &&
- pi.recovery_info.soid.snap != CEPH_SNAPDIR);
- }
- }
-
bool complete = pi.is_complete();
submit_push_data(pi.recovery_info, first,
@@ -6153,53 +6339,17 @@ bool ReplicatedPG::handle_pull_response(
pop.omap_entries,
t);
- info.stats.stats.sum.num_keys_recovered += pop.omap_entries.size();
-
- if (complete) {
- info.stats.stats.sum.num_objects_recovered++;
-
- SnapSetContext *ssc;
- if (hoid.snap == CEPH_NOSNAP || hoid.snap == CEPH_SNAPDIR) {
- ssc = create_snapset_context(hoid.oid);
- ssc->snapset = pi.recovery_info.ss;
- } else {
- ssc = get_snapset_context(hoid.oid, hoid.get_key(), hoid.hash, false,
- hoid.get_namespace());
- assert(ssc);
- }
- ObjectContextRef obc = create_object_context(pi.recovery_info.oi, ssc);
- obc->obs.exists = true;
-
- obc->ondisk_write_lock();
-
- // keep track of active pushes for scrub
- ++active_pushes;
-
- t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
- t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
- t->register_on_complete(
- new C_OSD_CompletedPull(this, hoid, get_osdmap()->get_epoch()));
- }
-
- t->register_on_commit(
- new C_OSD_CommittedPushedObject(
- this,
- get_osdmap()->get_epoch(),
- info.last_complete));
+ pi.stat.num_keys_recovered += pop.omap_entries.size();
if (complete) {
+ to_continue->push_back(pi.obc);
+ pi.stat.num_objects_recovered++;
+ get_parent()->on_local_recover(
+ hoid, pi.stat, pi.recovery_info, pi.obc, t);
pulling.erase(hoid);
pull_from_peer[from].erase(hoid);
- publish_stats_to_osd();
- if (waiting_for_missing_object.count(hoid)) {
- dout(20) << " kicking waiters on " << hoid << dendl;
- requeue_ops(waiting_for_missing_object[hoid]);
- waiting_for_missing_object.erase(hoid);
- if (pg_log.get_missing().missing.size() == 0) {
- requeue_ops(waiting_for_all_missing);
- waiting_for_all_missing.clear();
- }
- }
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
return false;
} else {
response->soid = pop.soid;
@@ -6215,11 +6365,11 @@ struct C_OnPushCommit : public Context {
C_OnPushCommit(ReplicatedPG *pg, OpRequestRef op) : pg(pg), op(op) {}
void finish(int) {
op->mark_event("committed");
- pg->log_subop_stats(op, l_osd_push_inb, l_osd_sop_push_lat);
+ log_subop_stats(pg->osd, op, l_osd_push_inb, l_osd_sop_push_lat);
}
};
-void ReplicatedPG::handle_push(
+void ReplicatedBackend::handle_push(
int from, PushOp &pop, PushReplyOp *response,
ObjectStore::Transaction *t)
{
@@ -6233,12 +6383,7 @@ void ReplicatedPG::handle_push(
bool complete = pop.after_progress.data_complete &&
pop.after_progress.omap_complete;
- // keep track of active pushes for scrub
- ++active_pushes;
-
response->soid = pop.recovery_info.soid;
- t->register_on_applied(
- new C_OSD_AppliedRecoveredObjectReplica(this));
submit_push_data(pop.recovery_info,
first,
complete,
@@ -6249,14 +6394,16 @@ void ReplicatedPG::handle_push(
pop.omap_entries,
t);
- t->register_on_commit(
- new C_OSD_CommittedPushedObject(
- this,
- get_osdmap()->get_epoch(),
- info.last_complete));
+ if (complete)
+ get_parent()->on_local_recover(
+ pop.recovery_info.soid,
+ object_stat_sum_t(),
+ pop.recovery_info,
+ ObjectContextRef(), // ok, is replica
+ t);
}
-void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
+void ReplicatedBackend::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
{
for (map<int, vector<PushOp> >::iterator i = pushes.begin();
i != pushes.end();
@@ -6280,7 +6427,7 @@ void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
uint64_t cost = 0;
uint64_t pushes = 0;
MOSDPGPush *msg = new MOSDPGPush();
- msg->pgid = info.pgid;
+ msg->pgid = get_info().pgid;
msg->map_epoch = get_osdmap()->get_epoch();
msg->set_priority(prio);
for (;
@@ -6301,7 +6448,7 @@ void ReplicatedPG::send_pushes(int prio, map<int, vector<PushOp> > &pushes)
}
}
-void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
+void ReplicatedBackend::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
{
for (map<int, vector<PullOp> >::iterator i = pulls.begin();
i != pulls.end();
@@ -6328,7 +6475,7 @@ void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
<< " to osd." << i->first << dendl;
MOSDPGPull *msg = new MOSDPGPull();
msg->set_priority(prio);
- msg->pgid = info.pgid;
+ msg->pgid = get_info().pgid;
msg->map_epoch = get_osdmap()->get_epoch();
msg->pulls.swap(i->second);
msg->compute_cost(cct);
@@ -6337,22 +6484,11 @@ void ReplicatedPG::send_pulls(int prio, map<int, vector<PullOp> > &pulls)
}
}
-int ReplicatedPG::send_push(int prio, int peer,
- const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress)
-{
- PushOp op;
- int r = build_push_op(recovery_info, progress, out_progress, &op);
- if (r < 0)
- return r;
- return send_push_op_legacy(prio, peer, op);
-}
-
-int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress,
- PushOp *out_op)
+int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat)
{
ObjectRecoveryProgress _new_progress;
if (!out_progress)
@@ -6376,7 +6512,7 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
object_info_t oi(bv);
if (oi.version != recovery_info.version) {
- osd->clog.error() << info.pgid << " push "
+ osd->clog.error() << get_info().pgid << " push "
<< recovery_info.soid << " v "
<< " failed because local copy is "
<< oi.version << "\n";
@@ -6439,11 +6575,14 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
if (new_progress.is_complete(recovery_info)) {
new_progress.data_complete = true;
- info.stats.stats.sum.num_objects_recovered++;
+ if (stat)
+ stat->num_objects_recovered++;
}
- info.stats.stats.sum.num_keys_recovered += out_op->omap_entries.size();
- info.stats.stats.sum.num_bytes_recovered += out_op->data.length();
+ if (stat) {
+ stat->num_keys_recovered += out_op->omap_entries.size();
+ stat->num_bytes_recovered += out_op->data.length();
+ }
osd->logger->inc(l_osd_push);
osd->logger->inc(l_osd_push_outb, out_op->data.length());
@@ -6457,11 +6596,11 @@ int ReplicatedPG::build_push_op(const ObjectRecoveryInfo &recovery_info,
return 0;
}
-int ReplicatedPG::send_push_op_legacy(int prio, int peer, PushOp &pop)
+int ReplicatedBackend::send_push_op_legacy(int prio, int peer, PushOp &pop)
{
tid_t tid = osd->get_tid();
osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
- MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, pop.soid,
+ MOSDSubOp *subop = new MOSDSubOp(rid, get_info().pgid, pop.soid,
false, 0, get_osdmap()->get_epoch(),
tid, pop.recovery_info.version);
subop->ops = vector<OSDOp>(1);
@@ -6482,16 +6621,16 @@ int ReplicatedPG::send_push_op_legacy(int prio, int peer, PushOp &pop)
return 0;
}
-void ReplicatedPG::prep_push_op_blank(const hobject_t& soid, PushOp *op)
+void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op)
{
op->recovery_info.version = eversion_t();
op->version = eversion_t();
op->soid = soid;
}
-void ReplicatedPG::sub_op_push_reply(OpRequestRef op)
+void ReplicatedBackend::sub_op_push_reply(OpRequestRef op)
{
- MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->request);
+ MOSDSubOpReply *reply = static_cast<MOSDSubOpReply*>(op->get_req());
const hobject_t& soid = reply->get_poid();
assert(reply->get_header().type == MSG_OSD_SUBOPREPLY);
dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl;
@@ -6504,10 +6643,10 @@ void ReplicatedPG::sub_op_push_reply(OpRequestRef op)
PushOp pop;
bool more = handle_push_reply(peer, rop, &pop);
if (more)
- send_push_op_legacy(pushing[soid][peer].priority, peer, pop);
+ send_push_op_legacy(op->get_req()->get_priority(), peer, pop);
}
-bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
+bool ReplicatedBackend::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
if (pushing.count(soid) == 0) {
@@ -6527,32 +6666,25 @@ bool ReplicatedPG::handle_push_reply(int peer, PushReplyOp &op, PushOp *reply)
<< pi->recovery_progress.data_recovered_to
<< " of " << pi->recovery_info.copy_subset << dendl;
ObjectRecoveryProgress new_progress;
- build_push_op(
+ int r = build_push_op(
pi->recovery_info,
- pi->recovery_progress, &new_progress, reply);
+ pi->recovery_progress, &new_progress, reply,
+ &(pi->stat));
+ assert(r == 0);
pi->recovery_progress = new_progress;
return true;
} else {
// done!
- if (peer == backfill_target && backfills_in_flight.count(soid))
- backfills_in_flight.erase(soid);
- else
- peer_missing[peer].got(soid, pi->recovery_info.version);
+ get_parent()->on_peer_recover(
+ peer, soid, pi->recovery_info,
+ pi->stat);
pushing[soid].erase(peer);
pi = NULL;
- publish_stats_to_osd();
if (pushing[soid].empty()) {
- pushing.erase(soid);
- dout(10) << "pushed " << soid << " to all replicas" << dendl;
- finish_recovery_op(soid);
- if (waiting_for_degraded_object.count(soid)) {
- requeue_ops(waiting_for_degraded_object[soid]);
- waiting_for_degraded_object.erase(soid);
- }
- finish_degraded_object(soid);
+ get_parent()->on_global_recover(soid);
} else {
dout(10) << "pushed " << soid << ", still waiting for push ack from "
<< pushing[soid].size() << " others" << dendl;
@@ -6590,9 +6722,9 @@ void ReplicatedPG::finish_degraded_object(const hobject_t& oid)
* process request to pull an entire object.
* NOTE: called from opqueue.
*/
-void ReplicatedPG::sub_op_pull(OpRequestRef op)
+void ReplicatedBackend::sub_op_pull(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
op->mark_started();
@@ -6617,16 +6749,17 @@ void ReplicatedPG::sub_op_pull(OpRequestRef op)
m->get_source().num(),
reply);
- log_subop_stats(op, 0, l_osd_sop_pull_lat);
+ log_subop_stats(osd, op, 0, l_osd_sop_pull_lat);
}
-void ReplicatedPG::handle_pull(int peer, PullOp &op, PushOp *reply)
+void ReplicatedBackend::handle_pull(int peer, PullOp &op, PushOp *reply)
{
const hobject_t &soid = op.soid;
struct stat st;
int r = osd->store->stat(coll, soid, &st);
if (r != 0) {
- osd->clog.error() << info.pgid << " " << peer << " tried to pull " << soid
+ osd->clog.error() << get_info().pgid << " "
+ << peer << " tried to pull " << soid
<< " but got " << cpp_strerror(-r) << "\n";
prep_push_op_blank(soid, reply);
} else {
@@ -6743,7 +6876,7 @@ void ReplicatedPG::recover_got(hobject_t oid, eversion_t v)
* @param intervals_usable intervals we want to keep
* @param data_usable matching data we want to keep
*/
-void ReplicatedPG::trim_pushed_data(
+void ReplicatedBackend::trim_pushed_data(
const interval_set<uint64_t> &copy_subset,
const interval_set<uint64_t> &intervals_received,
bufferlist data_received,
@@ -6781,10 +6914,10 @@ void ReplicatedPG::trim_pushed_data(
/** op_push
* NOTE: called from opqueue.
*/
-void ReplicatedPG::sub_op_push(OpRequestRef op)
+void ReplicatedBackend::sub_op_push(OpRequestRef op)
{
op->mark_started();
- MOSDSubOp *m = static_cast<MOSDSubOp *>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp *>(op->get_req());
PushOp pop;
pop.soid = m->recovery_info.soid;
@@ -6801,14 +6934,29 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
if (is_primary()) {
PullOp resp;
- bool more = handle_pull_response(m->get_source().num(), pop, &resp, t);
+ RPGHandle *h = _open_recovery_op();
+ list<ObjectContextRef> to_continue;
+ bool more = handle_pull_response(
+ m->get_source().num(), pop, &resp,
+ &to_continue, t);
if (more) {
send_pull_legacy(
m->get_priority(),
m->get_source().num(),
resp.recovery_info,
resp.recovery_progress);
- }
+ } else {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ op->get_req()->get_priority());
+ c->to_continue.swap(to_continue);
+ t->register_on_complete(
+ new C_QueueInWQ(
+ &osd->push_wq,
+ get_parent()->bless_gencontext(c)));
+ }
+ run_recovery_op(h, op->get_req()->get_priority());
} else {
PushReplyOp resp;
MOSDSubOpReply *reply = new MOSDSubOpReply(
@@ -6817,15 +6965,16 @@ void ReplicatedPG::sub_op_push(OpRequestRef op)
assert(entity_name_t::TYPE_OSD == m->get_connection()->peer_type);
handle_push(m->get_source().num(), pop, &resp, t);
t->register_on_complete(new C_OSD_SendMessageOnConn(
- osd, reply, m->get_connection()));
+ osd, reply, m->get_connection()));
}
- t->register_on_commit(new C_OnPushCommit(this, op));
- osd->store->queue_transaction(osr.get(), t);
+ get_parent()->queue_transaction(t);
return;
}
-void ReplicatedPG::_failed_push(int from, const hobject_t &soid)
+void ReplicatedPG::failed_push(int from, const hobject_t &soid)
{
+ assert(recovering.count(soid));
+ recovering.erase(soid);
map<hobject_t,set<int> >::iterator p = missing_loc.find(soid);
if (p != missing_loc.end()) {
dout(0) << "_failed_push " << soid << " from osd." << from
@@ -6838,15 +6987,21 @@ void ReplicatedPG::_failed_push(int from, const hobject_t &soid)
dout(0) << "_failed_push " << soid << " from osd." << from
<< " but not in missing_loc ???" << dendl;
}
-
finish_recovery_op(soid); // close out this attempt,
+}
+
+void ReplicatedBackend::_failed_push(int from, const hobject_t &soid)
+{
+ get_parent()->failed_push(from, soid);
pull_from_peer[from].erase(soid);
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
pulling.erase(soid);
}
void ReplicatedPG::sub_op_remove(OpRequestRef op)
{
- MOSDSubOp *m = static_cast<MOSDSubOp*>(op->request);
+ MOSDSubOp *m = static_cast<MOSDSubOp*>(op->get_req());
assert(m->get_header().type == MSG_OSD_SUBOP);
dout(7) << "sub_op_remove " << m->poid << dendl;
@@ -6907,7 +7062,7 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
obc->ondisk_write_lock();
- obc->obs.oi.lost = true;
+ obc->obs.oi.set_flag(object_info_t::FLAG_LOST);
obc->obs.oi.version = info.last_update;
obc->obs.oi.prior_version = version;
@@ -7069,7 +7224,7 @@ void ReplicatedPG::apply_and_flush_repops(bool requeue)
if (requeue) {
if (repop->ctx->op) {
- dout(10) << " requeuing " << *repop->ctx->op->request << dendl;
+ dout(10) << " requeuing " << *repop->ctx->op->get_req() << dendl;
rq.push_back(repop->ctx->op);
repop->ctx->op = OpRequestRef();
}
@@ -7124,7 +7279,7 @@ void ReplicatedPG::on_shutdown()
deleting = true;
unreg_next_scrub();
- requeue_cancel_copy_ops(false);
+ cancel_copy_ops();
apply_and_flush_repops(false);
context_registry_on_change();
@@ -7136,20 +7291,6 @@ void ReplicatedPG::on_shutdown()
cancel_recovery();
}
-void ReplicatedPG::on_flushed()
-{
- assert(object_contexts.empty());
- if (have_temp_coll() &&
- !osd->store->collection_empty(get_temp_coll())) {
- vector<hobject_t> objects;
- osd->store->collection_list(get_temp_coll(), objects);
- derr << __func__ << ": found objects in the temp collection: "
- << objects << ", crashing now"
- << dendl;
- assert(0 == "found garbage in the temp collection");
- }
-}
-
void ReplicatedPG::on_activate()
{
for (unsigned i = 1; i<acting.size(); i++) {
@@ -7175,7 +7316,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
context_registry_on_change();
- requeue_cancel_copy_ops(is_primary());
+ cancel_copy_ops();
// requeue object waiters
if (is_primary()) {
@@ -7212,20 +7353,7 @@ void ReplicatedPG::on_change(ObjectStore::Transaction *t)
// any dups
apply_and_flush_repops(is_primary());
- // clear pushing/pulling maps
- pushing.clear();
- pulling.clear();
- pull_from_peer.clear();
-
- // clear temp
- for (set<hobject_t>::iterator i = temp_contents.begin();
- i != temp_contents.end();
- ++i) {
- dout(10) << __func__ << ": Removing oid "
- << *i << " from the temp collection" << dendl;
- t->remove(get_temp_coll(t), *i);
- }
- temp_contents.clear();
+ pgbackend->on_change(t);
// clear snap_trimmer state
snap_trimmer_machine.process_event(Reset());
@@ -7251,9 +7379,16 @@ void ReplicatedPG::_clear_recovery_state()
backfill_pos = hobject_t();
backfills_in_flight.clear();
pending_backfill_updates.clear();
- pulling.clear();
- pushing.clear();
- pull_from_peer.clear();
+ recovering.clear();
+ pgbackend->clear_state();
+}
+
+void ReplicatedPG::cancel_pull(const hobject_t &soid)
+{
+ assert(recovering.count(soid));
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ pg_log.set_last_requested(0); // get recover_primary to start over
}
void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
@@ -7272,26 +7407,10 @@ void ReplicatedPG::check_recovery_sources(const OSDMapRef osdmap)
}
dout(10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
now_down.insert(*p);
-
- // reset pulls?
- map<int, set<hobject_t> >::iterator j = pull_from_peer.find(*p);
- if (j != pull_from_peer.end()) {
- dout(10) << "check_recovery_sources resetting pulls from osd." << *p
- << ", osdmap has it marked down" << dendl;
- for (set<hobject_t>::iterator i = j->second.begin();
- i != j->second.end();
- ++i) {
- assert(pulling.count(*i) == 1);
- pulling.erase(*i);
- finish_recovery_op(*i);
- }
- pg_log.set_last_requested(0);
- pull_from_peer.erase(j++);
- }
-
- // remove from missing_loc_sources
missing_loc_sources.erase(p++);
}
+ pgbackend->check_recovery_sources(osdmap);
+
if (now_down.empty()) {
dout(10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
} else {
@@ -7377,7 +7496,8 @@ int ReplicatedPG::start_recovery_ops(
}
bool deferred_backfill = false;
- if (state_test(PG_STATE_BACKFILL) &&
+ if (recovering.empty() &&
+ state_test(PG_STATE_BACKFILL) &&
backfill_target >= 0 && started < max &&
missing.num_missing() == 0 &&
!waiting_on_backfill) {
@@ -7405,9 +7525,11 @@ int ReplicatedPG::start_recovery_ops(
dout(10) << " started " << started << dendl;
osd->logger->inc(l_osd_rop, started);
- if (started || recovery_ops_active > 0 || deferred_backfill)
+ if (!recovering.empty() ||
+ started || recovery_ops_active > 0 || deferred_backfill)
return started;
+ assert(recovering.empty());
assert(recovery_ops_active == 0);
int unfound = get_num_unfound();
@@ -7473,7 +7595,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
const pg_missing_t &missing = pg_log.get_missing();
- dout(10) << "recover_primary pulling " << pulling.size() << " in pg" << dendl;
+ dout(10) << "recover_primary recovering " << recovering.size()
+ << " in pg" << dendl;
dout(10) << "recover_primary " << missing << dendl;
dout(25) << "recover_primary " << missing.missing << dendl;
@@ -7482,7 +7605,7 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
int started = 0;
int skipped = 0;
- map<int, vector<PullOp> > pulls;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
map<version_t, hobject_t>::const_iterator p =
missing.rmissing.lower_bound(pg_log.get_log().last_requested);
while (p != missing.rmissing.end()) {
@@ -7513,8 +7636,8 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
<< (unfound ? " (unfound)":"")
<< (missing.is_missing(soid) ? " (missing)":"")
<< (missing.is_missing(head) ? " (missing head)":"")
- << (pulling.count(soid) ? " (pulling)":"")
- << (pulling.count(head) ? " (pulling head)":"")
+ << (recovering.count(soid) ? " (recovering)":"")
+ << (recovering.count(head) ? " (recovering head)":"")
<< dendl;
if (latest) {
@@ -7589,14 +7712,14 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
}
}
- if (!pulling.count(soid)) {
- if (pulling.count(head)) {
+ if (!recovering.count(soid)) {
+ if (recovering.count(head)) {
++skipped;
} else if (unfound) {
++skipped;
} else {
- int r = prepare_pull(
- soid, need, cct->_conf->osd_recovery_op_priority, &pulls);
+ int r = recover_missing(
+ soid, need, cct->_conf->osd_recovery_op_priority, h);
switch (r) {
case PULL_YES:
++started;
@@ -7618,14 +7741,14 @@ int ReplicatedPG::recover_primary(int max, ThreadPool::TPHandle &handle)
if (!skipped)
pg_log.set_last_requested(v);
}
-
- send_pulls(cct->_conf->osd_recovery_op_priority, pulls);
+
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
return started;
}
int ReplicatedPG::prep_object_replica_pushes(
- const hobject_t& soid, eversion_t v, int prio,
- map<int, vector<PushOp> > *pushes)
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h)
{
dout(10) << __func__ << ": on " << soid << dendl;
@@ -7652,30 +7775,46 @@ int ReplicatedPG::prep_object_replica_pushes(
return 0;
}
- dout(10) << " ondisk_read_lock for " << soid << dendl;
+ start_recovery_op(soid);
+ assert(!recovering.count(soid));
+ recovering.insert(soid);
+
+ /* We need this in case there is an in progress write on the object. In fact,
+ * the only possible write is an update to the xattr due to a lost_revert --
+ * a client write would be blocked since the object is degraded.
+ * In almost all cases, therefore, this lock should be uncontended.
+ */
obc->ondisk_read_lock();
-
+ pgbackend->recover_object(
+ soid,
+ ObjectContextRef(),
+ obc, // has snapset context
+ h);
+ obc->ondisk_read_unlock();
+ return 1;
+}
+
+int ReplicatedBackend::start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obc,
+ RPGHandle *h)
+{
+ int pushes = 0;
// who needs it?
- bool started = false;
- for (unsigned i=1; i<acting.size(); i++) {
- int peer = acting[i];
- if (peer_missing.count(peer) &&
- peer_missing[peer].is_missing(soid)) {
- if (!started) {
- start_recovery_op(soid);
- started = true;
- }
- (*pushes)[peer].push_back(PushOp());
- prep_push_to_replica(obc, soid, peer, prio,
- &((*pushes)[peer].back())
+ for (unsigned i=1; i<get_parent()->get_acting().size(); i++) {
+ int peer = get_parent()->get_acting()[i];
+ map<int, pg_missing_t>::const_iterator j =
+ get_parent()->get_peer_missing().find(peer);
+ assert(j != get_parent()->get_peer_missing().end());
+ if (j->second.is_missing(soid)) {
+ ++pushes;
+ h->pushes[peer].push_back(PushOp());
+ prep_push_to_replica(obc, soid, peer,
+ &(h->pushes[peer].back())
);
}
}
-
- dout(10) << " ondisk_read_unlock on " << soid << dendl;
- obc->ondisk_read_unlock();
-
- return 1;
+ return pushes;
}
int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
@@ -7683,13 +7822,15 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
dout(10) << __func__ << "(" << max << ")" << dendl;
int started = 0;
- map<int, vector<PushOp> > pushes;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
// this is FAR from an optimal recovery order. pretty lame, really.
for (unsigned i=1; i<acting.size(); i++) {
int peer = acting[i];
map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
assert(pm != peer_missing.end());
+ map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
+ assert(pi != peer_info.end());
size_t m_sz = pm->second.num_missing();
dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
@@ -7703,8 +7844,17 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
handle.reset_tp_timeout();
const hobject_t soid(p->second);
- if (pushing.count(soid)) {
- dout(10) << __func__ << ": already pushing " << soid << dendl;
+ if (soid > pi->second.last_backfill) {
+ if (!recovering.count(soid)) {
+ derr << __func__ << ": object added to missing set for backfill, but "
+ << "is not in recovering, error!" << dendl;
+ assert(0);
+ }
+ continue;
+ }
+
+ if (recovering.count(soid)) {
+ dout(10) << __func__ << ": already recovering" << soid << dendl;
continue;
}
@@ -7719,13 +7869,11 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
map<hobject_t,pg_missing_t::item>::const_iterator r = m.missing.find(soid);
started += prep_object_replica_pushes(soid, r->second.need,
- cct->_conf->osd_recovery_op_priority,
- &pushes);
+ h);
}
}
- send_pushes(cct->_conf->osd_recovery_op_priority, pushes);
-
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
return started;
}
@@ -7772,17 +7920,12 @@ int ReplicatedPG::recover_backfill(
<< " interval " << pbi.begin << "-" << pbi.end
<< " " << pbi.objects.size() << " objects" << dendl;
- int local_min = osd->store->get_ideal_list_min();
- int local_max = osd->store->get_ideal_list_max();
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
- // re-scan our local interval to cope with recent changes
- // FIXME: we could track the eversion_t when we last scanned, and invalidate
- // that way. or explicitly modify/invalidate when we actually change specific
- // objects.
- dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl;
- backfill_info.clear();
- osr->flush();
- scan_range(backfill_pos, local_min, local_max, &backfill_info, handle);
+ // update our local interval to cope with recent changes
+ backfill_info.begin = backfill_pos;
+ update_range(&backfill_info, handle);
int ops = 0;
map<hobject_t, pair<eversion_t, eversion_t> > to_push;
@@ -7796,7 +7939,8 @@ int ReplicatedPG::recover_backfill(
if (backfill_info.begin <= pbi.begin &&
!backfill_info.extends_to_end() && backfill_info.empty()) {
osr->flush();
- scan_range(backfill_info.end, local_min, local_max, &backfill_info,
+ backfill_info.begin = backfill_info.end;
+ scan_range(local_min, local_max, &backfill_info,
handle);
backfill_info.trim();
}
@@ -7888,15 +8032,16 @@ int ReplicatedPG::recover_backfill(
send_remove_op(i->first, i->second, backfill_target);
}
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
map<int, vector<PushOp> > pushes;
for (map<hobject_t, pair<eversion_t, eversion_t> >::iterator i = to_push.begin();
i != to_push.end();
++i) {
handle.reset_tp_timeout();
prep_backfill_object_push(
- i->first, i->second.first, i->second.second, backfill_target, &pushes);
+ i->first, i->second.first, i->second.second, backfill_target, h);
}
- send_pushes(cct->_conf->osd_recovery_op_priority, pushes);
+ pgbackend->run_recovery_op(h, cct->_conf->osd_recovery_op_priority);
release_waiting_for_backfill_pos();
dout(5) << "backfill_pos is " << backfill_pos << " and pinfo.last_backfill is "
@@ -7942,35 +8087,95 @@ int ReplicatedPG::recover_backfill(
void ReplicatedPG::prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
- map<int, vector<PushOp> > *pushes)
+ PGBackend::RecoveryHandle *h)
{
dout(10) << "push_backfill_object " << oid << " v " << v << " to osd." << peer << dendl;
backfills_in_flight.insert(oid);
+ map<int, pg_missing_t>::iterator bpm = peer_missing.find(backfill_target);
+ assert(bpm != peer_missing.end());
+ bpm->second.add(oid, eversion_t(), eversion_t());
+
+ assert(!recovering.count(oid));
- if (!pushing.count(oid))
- start_recovery_op(oid);
+ start_recovery_op(oid);
+ recovering.insert(oid);
ObjectContextRef obc = get_object_context(oid, false);
+
+ // We need to take the read_lock here in order to flush in-progress writes
obc->ondisk_read_lock();
- (*pushes)[peer].push_back(PushOp());
- prep_push_to_replica(obc, oid, peer, cct->_conf->osd_recovery_op_priority,
- &((*pushes)[peer].back()));
+ pgbackend->recover_object(
+ oid,
+ ObjectContextRef(),
+ obc,
+ h);
obc->ondisk_read_unlock();
}
+void ReplicatedPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+ if (bi->version >= info.last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ assert(bi->version == info.last_update);
+ } else if (bi->version >= info.log_tail) {
+ assert(!pg_log.get_log().empty());
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log" << dendl;
+ list<pg_log_entry_t>::const_iterator i =
+ pg_log.get_log().log.end();
+ --i;
+ while (i != pg_log.get_log().log.begin() &&
+ i->version > bi->version) {
+ --i;
+ }
+ if (i->version == bi->version)
+ ++i;
+
+ assert(i != pg_log.get_log().log.end());
+ dout(10) << __func__ << ": updating from version " << i->version
+ << dendl;
+ for (; i != pg_log.get_log().log.end(); ++i) {
+ const hobject_t &soid = i->soid;
+ if (soid >= bi->begin && soid < bi->end) {
+ if (i->is_update()) {
+ dout(10) << __func__ << ": " << i->soid << " updated to version "
+ << i->version << dendl;
+ bi->objects.erase(i->soid);
+ bi->objects.insert(
+ make_pair(
+ i->soid,
+ i->version));
+ } else if (i->is_delete()) {
+ dout(10) << __func__ << ": " << i->soid << " removed" << dendl;
+ bi->objects.erase(i->soid);
+ }
+ }
+ }
+ bi->version = info.last_update;
+ } else {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ osr->flush();
+ scan_range(local_min, local_max, &backfill_info, handle);
+ }
+}
+
void ReplicatedPG::scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
assert(is_locked());
- dout(10) << "scan_range from " << begin << dendl;
- bi->begin = begin;
+ dout(10) << "scan_range from " << bi->begin << dendl;
+ bi->version = info.last_update;
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);
- int r = osd->store->collection_list_partial(coll, begin, min, max,
- 0, &ls, &bi->end);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end);
assert(r >= 0);
dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
dout(20) << ls << dendl;
@@ -7985,7 +8190,7 @@ void ReplicatedPG::scan_range(
dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
} else {
bufferlist bl;
- int r = osd->store->getattr(coll, *p, OI_ATTR, bl);
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
assert(r >= 0);
object_info_t oi(bl);
bi->objects[*p] = oi.version;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index e880bdecade..27c9d1bb605 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -18,6 +18,7 @@
#define CEPH_REPLICATEDPG_H
#include <boost/optional.hpp>
+#include <boost/tuple/tuple.hpp>
#include "include/assert.h"
#include "common/cmdparse.h"
@@ -33,6 +34,9 @@
#include "common/sharedptr_registry.hpp"
+#include "PGBackend.h"
+#include "ReplicatedBackend.h"
+
class MOSDSubOpReply;
class ReplicatedPG;
@@ -80,7 +84,7 @@ public:
virtual bool filter(bufferlist& xattr_data, bufferlist& outdata);
};
-class ReplicatedPG : public PG {
+class ReplicatedPG : public PG, public PGBackend::Listener {
friend class OSD;
friend class Watch;
@@ -90,17 +94,17 @@ public:
* state associated with a copy operation
*/
struct OpContext;
+ class CopyCallback;
struct CopyOp {
- OpContext *ctx;
+ CopyCallback *cb;
+ ObjectContextRef obc;
hobject_t src;
object_locator_t oloc;
version_t version;
tid_t objecter_tid;
- list<OpRequestRef> waiting;
-
object_copy_cursor_t cursor;
uint64_t size;
utime_t mtime;
@@ -113,15 +117,198 @@ public:
hobject_t temp_oid;
object_copy_cursor_t temp_cursor;
- CopyOp(OpContext *c, hobject_t s, object_locator_t l, version_t v)
- : ctx(c), src(s), oloc(l), version(v),
+ CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l,
+ version_t v, const hobject_t& dest)
+ : cb(cb_), obc(_obc), src(s), oloc(l), version(v),
objecter_tid(0),
size(0),
- rval(-1)
+ rval(-1),
+ temp_oid(dest)
{}
};
typedef boost::shared_ptr<CopyOp> CopyOpRef;
+ /**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation. The presence
+ * of the closing Transaction ensures that write operations can be performed
+ * atomically with the copy being completed (which doing them in separate
+ * transactions would not allow); if you are doing the copy for a read
+ * op you will have to generate a separate op to finish the copy with.
+ */
+ /// return code, total object size, data in temp object?, final Transaction
+ typedef boost::tuple<int, size_t, bool, ObjectStore::Transaction> CopyResults;
+ class CopyCallback : public GenContext<CopyResults&> {
+ protected:
+ CopyCallback() {}
+ /**
+ * results.get<0>() is the return code: 0 for success; -ECANCELLED if
+ * the operation was cancelled by the local OSD; -errno for other issues.
+ * results.get<1>() is the total size of the object (for updating pg stats)
+ * results.get<2>() indicates whether we have already written data to
+ * the temp object (so it needs to get cleaned up, if the return code
+ * indicates a failure)
+ * results.get<3>() is a Transaction; if non-empty you need to perform
+ * its results before any other accesses to the object in order to
+ * complete the copy.
+ */
+ virtual void finish(CopyResults& results_) = 0;
+
+ public:
+ /// Provide the final size of the copied object to the CopyCallback
+ virtual ~CopyCallback() {};
+ };
+
+ class CopyFromCallback: public CopyCallback {
+ public:
+ CopyResults results;
+ OpContext *ctx;
+ hobject_t temp_obj;
+ CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) :
+ ctx(ctx_), temp_obj(temp_obj_) {}
+ ~CopyFromCallback() {}
+
+ virtual void finish(CopyResults& results_) {
+ results = results_;
+ int r = results.get<0>();
+ if (r >= 0) {
+ ctx->pg->execute_ctx(ctx);
+ }
+ ctx->copy_cb = NULL;
+ if (r < 0) {
+ if (r != -ECANCELED) { // on cancel just toss it out; client resends
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ }
+ delete ctx;
+ }
+ }
+
+ bool is_temp_obj_used() { return results.get<2>(); }
+ uint64_t get_data_size() { return results.get<1>(); }
+ int get_result() { return results.get<0>(); }
+ };
+ friend class CopyFromCallback;
+
+ boost::scoped_ptr<PGBackend> pgbackend;
+ PGBackend *get_pgbackend() {
+ return pgbackend.get();
+ }
+
+ /// Listener methods
+ void on_local_recover_start(
+ const hobject_t &oid,
+ ObjectStore::Transaction *t);
+ void on_local_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ ObjectStore::Transaction *t
+ );
+ void on_peer_recover(
+ int peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ const object_stat_sum_t &stat
+ );
+ void begin_peer_recover(
+ int peer,
+ const hobject_t oid);
+ void on_global_recover(
+ const hobject_t &oid);
+ void failed_push(int from, const hobject_t &soid);
+ void cancel_pull(const hobject_t &soid);
+
+ template <typename T>
+ class BlessedGenContext : public GenContext<T> {
+ ReplicatedPG *pg;
+ GenContext<T> *c;
+ epoch_t e;
+ public:
+ BlessedGenContext(ReplicatedPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) {
+ pg->lock();
+ if (pg->pg_has_reset_since(e))
+ delete c;
+ else
+ c->complete(t);
+ pg->unlock();
+ }
+ };
+ class BlessedContext : public Context {
+ ReplicatedPG *pg;
+ Context *c;
+ epoch_t e;
+ public:
+ BlessedContext(ReplicatedPG *pg, Context *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(int r) {
+ pg->lock();
+ if (pg->pg_has_reset_since(e))
+ delete c;
+ else
+ c->complete(r);
+ pg->unlock();
+ }
+ };
+ Context *bless_context(Context *c) {
+ return new BlessedContext(this, c, get_osdmap()->get_epoch());
+ }
+ GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new BlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap()->get_epoch());
+ }
+
+ void send_message(int to_osd, Message *m) {
+ osd->send_message_osd_cluster(to_osd, m, get_osdmap()->get_epoch());
+ }
+ void queue_transaction(ObjectStore::Transaction *t) {
+ osd->store->queue_transaction(osr.get(), t);
+ }
+ epoch_t get_epoch() {
+ return get_osdmap()->get_epoch();
+ }
+ const vector<int> &get_acting() {
+ return acting;
+ }
+ std::string gen_dbg_prefix() const { return gen_prefix(); }
+
+ const map<hobject_t, set<int> > &get_missing_loc() {
+ return missing_loc;
+ }
+ const map<int, pg_missing_t> &get_peer_missing() {
+ return peer_missing;
+ }
+ const map<int, pg_info_t> &get_peer_info() {
+ return peer_info;
+ }
+ const pg_missing_t &get_local_missing() {
+ return pg_log.get_missing();
+ }
+ const PGLog &get_log() {
+ return pg_log;
+ }
+ bool pgb_is_primary() const {
+ return is_primary();
+ }
+ OSDMapRef pgb_get_osdmap() const {
+ return get_osdmap();
+ }
+ const pg_info_t &get_info() const {
+ return info;
+ }
+ ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ map<string, bufferptr> &attrs) {
+ return get_object_context(hoid, true, &attrs);
+ }
+
/*
* Capture all object state associated with an in-progress read or write.
*/
@@ -140,6 +327,7 @@ public:
bool modify; // (force) modification (even if op_t is empty)
bool user_modify; // user-visible modification
+ bool undirty; // user explicitly un-dirtying this object
// side effects
list<watch_info_t> watch_connects;
@@ -182,7 +370,7 @@ public:
int num_read; ///< count read ops
int num_write; ///< count update ops
- CopyOpRef copy_op;
+ CopyFromCallback *copy_cb;
hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
@@ -194,12 +382,13 @@ public:
ReplicatedPG *_pg) :
op(_op), reqid(_reqid), ops(_ops), obs(_obs), snapset(0),
new_obs(_obs->oi, _obs->exists),
- modify(false), user_modify(false),
+ modify(false), user_modify(false), undirty(false),
bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
data_off(0), reply(NULL), pg(_pg),
num_read(0),
- num_write(0) {
+ num_write(0),
+ copy_cb(NULL) {
if (_ssc) {
new_snapset = _ssc->snapset;
snapset = &_ssc->snapset;
@@ -339,7 +528,11 @@ public:
protected:
ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
- ObjectContextRef get_object_context(const hobject_t& soid, bool can_create);
+ ObjectContextRef get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ map<string, bufferptr> *attrs = 0
+ );
void context_registry_on_change();
void object_context_destructor_callback(ObjectContext *obc);
@@ -362,8 +555,11 @@ protected:
void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
SnapSetContext *create_snapset_context(const object_t& oid);
- SnapSetContext *get_snapset_context(const object_t& oid, const string &key,
- ps_t seed, bool can_create, const string &nspace);
+ SnapSetContext *get_snapset_context(
+ const object_t& oid, const string &key,
+ ps_t seed, bool can_create, const string &nspace,
+ map<string, bufferptr> *attrs = 0
+ );
void register_snapset_context(SnapSetContext *ssc) {
Mutex::Locker l(snapset_contexts_lock);
_register_snapset_context(ssc);
@@ -378,90 +574,7 @@ protected:
}
void put_snapset_context(SnapSetContext *ssc);
- // push
- struct PushInfo {
- ObjectRecoveryProgress recovery_progress;
- ObjectRecoveryInfo recovery_info;
- int priority;
-
- void dump(Formatter *f) const {
- {
- f->open_object_section("recovery_progress");
- recovery_progress.dump(f);
- f->close_section();
- }
- {
- f->open_object_section("recovery_info");
- recovery_info.dump(f);
- f->close_section();
- }
- }
- };
- map<hobject_t, map<int, PushInfo> > pushing;
-
- // pull
- struct PullInfo {
- ObjectRecoveryProgress recovery_progress;
- ObjectRecoveryInfo recovery_info;
- int priority;
-
- void dump(Formatter *f) const {
- {
- f->open_object_section("recovery_progress");
- recovery_progress.dump(f);
- f->close_section();
- }
- {
- f->open_object_section("recovery_info");
- recovery_info.dump(f);
- f->close_section();
- }
- }
-
- bool is_complete() const {
- return recovery_progress.is_complete(recovery_info);
- }
- };
- map<hobject_t, PullInfo> pulling;
-
- ObjectRecoveryInfo recalc_subsets(const ObjectRecoveryInfo& recovery_info);
- static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
- const interval_set<uint64_t> &intervals_received,
- bufferlist data_received,
- interval_set<uint64_t> *intervals_usable,
- bufferlist *data_usable);
- bool handle_pull_response(
- int from, PushOp &op, PullOp *response,
- ObjectStore::Transaction *t);
- void handle_push(
- int from, PushOp &op, PushReplyOp *response,
- ObjectStore::Transaction *t);
- void send_pushes(int prio, map<int, vector<PushOp> > &pushes);
- int send_push(int priority, int peer,
- const ObjectRecoveryInfo& recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress = 0);
- int build_push_op(const ObjectRecoveryInfo &recovery_info,
- const ObjectRecoveryProgress &progress,
- ObjectRecoveryProgress *out_progress,
- PushOp *out_op);
- int send_push_op_legacy(int priority, int peer,
- PushOp &pop);
-
- int send_pull_legacy(int priority, int peer,
- const ObjectRecoveryInfo& recovery_info,
- ObjectRecoveryProgress progress);
- void submit_push_data(ObjectRecoveryInfo &recovery_info,
- bool first,
- bool complete,
- const interval_set<uint64_t> &intervals_included,
- bufferlist data_included,
- bufferlist omap_header,
- map<string, bufferptr> &attrs,
- map<string, bufferlist> &omap_entries,
- ObjectStore::Transaction *t);
- void submit_push_complete(ObjectRecoveryInfo &recovery_info,
- ObjectStore::Transaction *t);
+ set<hobject_t> recovering;
/*
* Backfill
@@ -504,54 +617,17 @@ protected:
f->close_section();
}
{
- f->open_array_section("pull_from_peer");
- for (map<int, set<hobject_t> >::const_iterator i = pull_from_peer.begin();
- i != pull_from_peer.end();
+ f->open_array_section("recovering");
+ for (set<hobject_t>::const_iterator i = recovering.begin();
+ i != recovering.end();
++i) {
- f->open_object_section("pulling_from");
- f->dump_int("pull_from", i->first);
- {
- f->open_array_section("pulls");
- for (set<hobject_t>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- f->open_object_section("pull_info");
- assert(pulling.count(*j));
- pulling.find(*j)->second.dump(f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
+ f->dump_stream("object") << *i;
}
f->close_section();
}
{
- f->open_array_section("pushing");
- for (map<hobject_t, map<int, PushInfo> >::const_iterator i =
- pushing.begin();
- i != pushing.end();
- ++i) {
- f->open_object_section("object");
- f->dump_stream("pushing") << i->first;
- {
- f->open_array_section("pushing_to");
- for (map<int, PushInfo>::const_iterator j = i->second.begin();
- j != i->second.end();
- ++j) {
- f->open_object_section("push_progress");
- f->dump_stream("object_pushing") << j->first;
- {
- f->open_object_section("push_info");
- j->second.dump(f);
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
- }
- f->close_section();
- }
+ f->open_object_section("pg_backend");
+ pgbackend->dump_recovery_info(f);
f->close_section();
}
}
@@ -559,53 +635,19 @@ protected:
/// leading edge of backfill
hobject_t backfill_pos;
- // Reverse mapping from osd peer to objects beging pulled from that peer
- map<int, set<hobject_t> > pull_from_peer;
-
int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
- int priority,
- map<int, vector<PushOp> > *pushes);
- void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
- pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets);
- void calc_clone_subsets(SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
- const hobject_t &last_backfill,
- interval_set<uint64_t>& data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets);
- void prep_push_to_replica(
- ObjectContextRef obc,
- const hobject_t& oid,
- int dest,
- int priority,
- PushOp *push_op);
- void prep_push(int priority,
- ObjectContextRef obc,
- const hobject_t& oid, int dest,
- PushOp *op);
- void prep_push(int priority,
- ObjectContextRef obc,
- const hobject_t& soid, int peer,
- eversion_t version,
- interval_set<uint64_t> &data_subset,
- map<hobject_t, interval_set<uint64_t> >& clone_subsets,
- PushOp *op);
- void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ PGBackend::RecoveryHandle *h);
void finish_degraded_object(const hobject_t& oid);
// Cancels/resets pulls from peer
void check_recovery_sources(const OSDMapRef map);
- void send_pulls(
- int priority,
- map<int, vector<PullOp> > &pulls);
- int prepare_pull(
- const hobject_t& oid, eversion_t v,
+ int recover_missing(
+ const hobject_t& oid,
+ eversion_t v,
int priority,
- map<int, vector<PullOp> > *pulls
- );
+ PGBackend::RecoveryHandle *h);
// low level ops
@@ -651,13 +693,19 @@ protected:
* @bi [out] resulting map of objects to eversion_t's
*/
void scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
void prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
- map<int, vector<PushOp> > *pushes);
+ PGBackend::RecoveryHandle *h);
void send_remove_op(const hobject_t& oid, eversion_t v, int peer);
@@ -694,12 +742,17 @@ protected:
}
};
struct C_OSD_OndiskWriteUnlock : public Context {
- ObjectContextRef obc, obc2;
- C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {}
+ ObjectContextRef obc, obc2, obc3;
+ C_OSD_OndiskWriteUnlock(
+ ObjectContextRef o,
+ ObjectContextRef o2 = ObjectContextRef(),
+ ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
void finish(int r) {
obc->ondisk_write_unlock();
if (obc2)
obc2->ondisk_write_unlock();
+ if (obc3)
+ obc3->ondisk_write_unlock();
}
};
struct C_OSD_OndiskWriteUnlockList : public Context {
@@ -731,35 +784,6 @@ protected:
pg->_committed_pushed_object(epoch, last_complete);
}
};
- struct C_OSD_SendMessageOnConn: public Context {
- OSDService *osd;
- Message *reply;
- ConnectionRef conn;
- C_OSD_SendMessageOnConn(
- OSDService *osd,
- Message *reply,
- ConnectionRef conn) : osd(osd), reply(reply), conn(conn) {}
- void finish(int) {
- osd->send_message_osd_cluster(reply, conn.get());
- }
- };
- struct C_OSD_CompletedPull : public Context {
- ReplicatedPGRef pg;
- hobject_t hoid;
- epoch_t epoch;
- C_OSD_CompletedPull(
- ReplicatedPG *pg,
- const hobject_t &hoid,
- epoch_t epoch) : pg(pg), hoid(hoid), epoch(epoch) {}
- void finish(int) {
- pg->lock();
- if (!pg->pg_has_reset_since(epoch)) {
- pg->finish_recovery_op(hoid);
- }
- pg->unlock();
- }
- };
- friend struct C_OSD_CompletedPull;
struct C_OSD_AppliedRecoveredObjectReplica : public Context {
ReplicatedPGRef pg;
C_OSD_AppliedRecoveredObjectReplica(ReplicatedPG *p) :
@@ -780,26 +804,21 @@ protected:
void _applied_recovered_object_replica();
void _committed_pushed_object(epoch_t epoch, eversion_t lc);
void recover_got(hobject_t oid, eversion_t v);
- void sub_op_push(OpRequestRef op);
- void _failed_push(int from, const hobject_t &soid);
- void sub_op_push_reply(OpRequestRef op);
- bool handle_push_reply(int peer, PushReplyOp &op, PushOp *reply);
- void sub_op_pull(OpRequestRef op);
- void handle_pull(int peer, PullOp &op, PushOp *reply);
-
- void log_subop_stats(OpRequestRef op, int tag_inb, int tag_lat);
// -- copyfrom --
map<hobject_t, CopyOpRef> copy_ops;
- int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version,
- CopyOpRef *pcop);
+ int start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
+ object_locator_t oloc, version_t version,
+ const hobject_t& temp_dest_oid);
void process_copy_chunk(hobject_t oid, tid_t tid, int r);
void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t);
- void _copy_some(OpContext *ctx, CopyOpRef cop);
- int finish_copy(OpContext *ctx);
+ void _copy_some(ObjectContextRef obc, CopyOpRef cop);
+ void _build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t);
+ int finish_copyfrom(OpContext *ctx);
void cancel_copy(CopyOpRef cop);
- void requeue_cancel_copy_ops(bool requeue=true);
+ void cancel_copy_ops();
friend class C_Copyfrom;
@@ -828,6 +847,9 @@ public:
int do_command(cmdmap_t cmdmap, ostream& ss, bufferlist& idata,
bufferlist& odata);
+ void do_request(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle);
void do_op(OpRequestRef op);
bool pg_op_must_wait(MOSDOp *op);
void do_pg_op(OpRequestRef op);
@@ -837,17 +859,7 @@ public:
OpRequestRef op,
ThreadPool::TPHandle &handle);
void do_backfill(OpRequestRef op);
- void _do_push(OpRequestRef op);
- void _do_pull_response(OpRequestRef op);
- void do_push(OpRequestRef op) {
- if (is_primary()) {
- _do_pull_response(op);
- } else {
- _do_push(op);
- }
- }
- void do_pull(OpRequestRef op);
- void do_push_reply(OpRequestRef op);
+
RepGather *trim_object(const hobject_t &coid);
void snap_trimmer();
int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops);
@@ -857,16 +869,27 @@ public:
void do_osd_op_effects(OpContext *ctx);
private:
- bool temp_created;
- coll_t temp_coll;
- set<hobject_t> temp_contents; ///< contents of temp collection, clear on reset
uint64_t temp_seq; ///< last id for naming temp objects
coll_t get_temp_coll(ObjectStore::Transaction *t);
hobject_t generate_temp_object(); ///< generate a new temp object name
public:
- bool have_temp_coll();
- coll_t get_temp_coll() {
- return temp_coll;
+ void get_colls(list<coll_t> *out) {
+ out->push_back(coll);
+ return pgbackend->temp_colls(out);
+ }
+ void split_colls(
+ pg_t child,
+ int split_bits,
+ int seed,
+ ObjectStore::Transaction *t) {
+ coll_t target = coll_t(child);
+ t->create_collection(target);
+ t->split_collection(
+ coll,
+ split_bits,
+ seed,
+ target);
+ pgbackend->split_colls(child, split_bits, seed, t);
}
private:
struct NotTrimming;
@@ -922,7 +945,6 @@ private:
int _get_tmap(OpContext *ctx, map<string, bufferlist> *out,
bufferlist *header);
- int _copy_up_tmap(OpContext *ctx);
int _delete_head(OpContext *ctx);
int _rollback_to(OpContext *ctx, ceph_osd_op& op);
public:
@@ -952,7 +974,10 @@ public:
void on_role_change();
void on_change(ObjectStore::Transaction *t);
void on_activate();
- void on_flushed();
+ void on_flushed() {
+ assert(object_contexts.empty());
+ pgbackend->on_flushed();
+ }
void on_removal(ObjectStore::Transaction *t);
void on_shutdown();
};
@@ -968,7 +993,7 @@ inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& repop)
//<< " wfnvram=" << repop.waitfor_nvram
<< " wfdisk=" << repop.waitfor_disk;
if (repop.ctx->op)
- out << " op=" << *(repop.ctx->op->request);
+ out << " op=" << *(repop.ctx->op->get_req());
out << ")";
return out;
}
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index aa20dc592fa..27f7b171677 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2796,9 +2796,8 @@ void object_info_t::copy_user_bits(const object_info_t& other)
last_reqid = other.last_reqid;
truncate_seq = other.truncate_seq;
truncate_size = other.truncate_size;
- lost = other.lost;
+ flags = other.flags;
category = other.category;
- uses_tmap = other.uses_tmap;
}
ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
@@ -2824,7 +2823,7 @@ void object_info_t::encode(bufferlist& bl) const
++i) {
old_watchers.insert(make_pair(i->first.second, i->second));
}
- ENCODE_START(11, 8, bl);
+ ENCODE_START(12, 8, bl);
::encode(soid, bl);
::encode(myoloc, bl); //Retained for compatibility
::encode(category, bl);
@@ -2839,13 +2838,15 @@ void object_info_t::encode(bufferlist& bl) const
::encode(snaps, bl);
::encode(truncate_seq, bl);
::encode(truncate_size, bl);
- ::encode(lost, bl);
+ __u8 flags_lo = flags & 0xff;
+ __u8 flags_hi = (flags & 0xff00) >> 8;
+ ::encode(flags_lo, bl);
::encode(old_watchers, bl);
/* shenanigans to avoid breaking backwards compatibility in the disk format.
* When we can, switch this out for simply putting the version_t on disk. */
eversion_t user_eversion(0, user_version);
::encode(user_eversion, bl);
- ::encode(uses_tmap, bl);
+ ::encode(flags_hi, bl);
::encode(watchers, bl);
ENCODE_FINISH(bl);
}
@@ -2853,7 +2854,7 @@ void object_info_t::encode(bufferlist& bl) const
void object_info_t::decode(bufferlist::iterator& bl)
{
object_locator_t myoloc;
- DECODE_START_LEGACY_COMPAT_LEN(11, 8, 8, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(12, 8, 8, bl);
map<entity_name_t, watch_info_t> old_watchers;
if (struct_v >= 2 && struct_v <= 5) {
sobject_t obj;
@@ -2883,20 +2884,26 @@ void object_info_t::decode(bufferlist::iterator& bl)
::decode(snaps, bl);
::decode(truncate_seq, bl);
::decode(truncate_size, bl);
- if (struct_v >= 3)
- ::decode(lost, bl);
- else
- lost = false;
+ if (struct_v >= 3) {
+ __u8 lo;
+ ::decode(lo, bl);
+ flags = (flag_t)lo;
+ } else {
+ flags = (flag_t)0;
+ }
if (struct_v >= 4) {
::decode(old_watchers, bl);
eversion_t user_eversion;
::decode(user_eversion, bl);
user_version = user_eversion.version;
}
- if (struct_v >= 9)
- ::decode(uses_tmap, bl);
- else
- uses_tmap = true;
+ if (struct_v >= 9) {
+ __u8 hi;
+ ::decode(hi, bl);
+ flags = (flag_t)(flags | ((unsigned)hi << 8));
+ } else {
+ set_flag(FLAG_USES_TMAP);
+ }
if (struct_v < 10)
soid.pool = myoloc.pool;
if (struct_v >= 11) {
@@ -2924,7 +2931,8 @@ void object_info_t::dump(Formatter *f) const
f->dump_stream("last_reqid") << last_reqid;
f->dump_unsigned("size", size);
f->dump_stream("mtime") << mtime;
- f->dump_unsigned("lost", lost);
+ f->dump_unsigned("lost", (int)is_lost());
+ f->dump_unsigned("flags", (int)flags);
f->dump_stream("wrlock_by") << wrlock_by;
f->open_array_section("snaps");
for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p)
@@ -2960,8 +2968,8 @@ ostream& operator<<(ostream& out, const object_info_t& oi)
out << " wrlock_by=" << oi.wrlock_by;
else
out << " " << oi.snaps;
- if (oi.lost)
- out << " LOST";
+ if (oi.flags)
+ out << " " << oi.get_flag_string();
out << ")";
return out;
}
@@ -3515,6 +3523,8 @@ ostream& operator<<(ostream& out, const OSDOp& op)
case CEPH_OSD_OP_DELETE:
case CEPH_OSD_OP_LIST_WATCHERS:
case CEPH_OSD_OP_LIST_SNAPS:
+ case CEPH_OSD_OP_UNDIRTY:
+ case CEPH_OSD_OP_ISDIRTY:
break;
case CEPH_OSD_OP_ASSERT_VER:
out << " v" << op.op.assert_ver.ver;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 091b2b95e8f..a54fc65f375 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -23,6 +23,7 @@
#include "include/types.h"
#include "include/utime.h"
#include "include/CompatSet.h"
+#include "include/histogram.h"
#include "include/interval_set.h"
#include "common/snap_types.h"
#include "common/Formatter.h"
@@ -41,10 +42,12 @@
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
+#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
typedef hobject_t collection_list_handle_t;
+typedef uint8_t shard_id_t;
/**
* osd request identifier
@@ -553,67 +556,6 @@ inline ostream& operator<<(ostream& out, const eversion_t e) {
return out << e.epoch << "'" << e.version;
}
-
-/**
- * power of 2 histogram
- */
-struct pow2_hist_t {
- /**
- * histogram
- *
- * bin size is 2^index
- * value is count of elements that are <= the current bin but > the previous bin.
- */
- vector<int32_t> h;
-
-private:
- /// expand to at least another's size
- void _expand_to(unsigned s) {
- if (s > h.size())
- h.resize(s, 0);
- }
- /// drop useless trailing 0's
- void _contract() {
- unsigned p = h.size();
- while (p > 0 && h[p-1] == 0)
- --p;
- h.resize(p);
- }
-
-public:
- void clear() {
- h.clear();
- }
- void set(int bin, int32_t v) {
- _expand_to(bin + 1);
- h[bin] = v;
- _contract();
- }
-
- void add(const pow2_hist_t& o) {
- _expand_to(o.h.size());
- for (unsigned p = 0; p < o.h.size(); ++p)
- h[p] += o.h[p];
- _contract();
- }
- void sub(const pow2_hist_t& o) {
- _expand_to(o.h.size());
- for (unsigned p = 0; p < o.h.size(); ++p)
- h[p] -= o.h[p];
- _contract();
- }
-
- int32_t upper_bound() const {
- return 1 << h.size();
- }
-
- void dump(Formatter *f) const;
- void encode(bufferlist &bl) const;
- void decode(bufferlist::iterator &bl);
- static void generate_test_instances(std::list<pow2_hist_t*>& o);
-};
-WRITE_CLASS_ENCODER(pow2_hist_t)
-
/**
* filestore_perf_stat_t
*
@@ -2091,22 +2033,68 @@ struct object_info_t {
uint64_t size;
utime_t mtime;
- bool lost;
+
+ // note: these are currently encoded into a total 16 bits; see
+ // encode()/decode() for the weirdness.
+ typedef enum {
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+ // ...
+ FLAG_USES_TMAP = 1<<8, // deprecated; no longer used.
+ } flag_t;
+
+ flag_t flags;
+
+ static string get_flag_string(flag_t flags) {
+ string s;
+ if (flags & FLAG_LOST)
+ s += "|lost";
+ if (flags & FLAG_WHITEOUT)
+ s += "|whiteout";
+ if (flags & FLAG_DIRTY)
+ s += "|dirty";
+ if (flags & FLAG_USES_TMAP)
+ s += "|uses_tmap";
+ if (s.length())
+ return s.substr(1);
+ return s;
+ }
+ string get_flag_string() const {
+ return get_flag_string(flags);
+ }
osd_reqid_t wrlock_by; // [head]
vector<snapid_t> snaps; // [clone]
uint64_t truncate_seq, truncate_size;
-
map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
- bool uses_tmap;
void copy_user_bits(const object_info_t& other);
static ps_t legacy_object_locator_to_ps(const object_t &oid,
const object_locator_t &loc);
+ bool test_flag(flag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(flag_t f) {
+ flags = (flag_t)(flags | f);
+ }
+ void clear_flag(flag_t f) {
+ flags = (flag_t)(flags & ~f);
+ }
+ bool is_lost() const {
+ return test_flag(FLAG_LOST);
+ }
+ bool is_whiteout() const {
+ return test_flag(FLAG_WHITEOUT);
+ }
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
void decode(bufferlist& bl) {
@@ -2117,13 +2105,14 @@ struct object_info_t {
static void generate_test_instances(list<object_info_t*>& o);
explicit object_info_t()
- : user_version(0), size(0), lost(false),
- truncate_seq(0), truncate_size(0), uses_tmap(false)
+ : user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0)
{}
object_info_t(const hobject_t& s)
- : soid(s), user_version(0), size(0),
- lost(false), truncate_seq(0), truncate_size(0), uses_tmap(false) {}
+ : soid(s),
+ user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0) {}
object_info_t(bufferlist& bl) {
decode(bl);
@@ -2133,7 +2122,7 @@ WRITE_CLASS_ENCODER(object_info_t)
struct ObjectState {
object_info_t oi;
- bool exists;
+ bool exists; ///< the stored object exists (i.e., we will remember the object_info_t)
ObjectState() : exists(false) {}
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 590b5d473a8..81335b7957f 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -11,6 +11,8 @@
#include "include/assert.h"
+#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on while holding the lock
+
/*** ObjectCacher::BufferHead ***/
@@ -1448,8 +1450,10 @@ void ObjectCacher::flusher_entry()
utime_t cutoff = ceph_clock_now(cct);
cutoff -= max_dirty_age;
BufferHead *bh = 0;
+ int max = MAX_FLUSH_UNDER_LOCK;
while ((bh = static_cast<BufferHead*>(bh_lru_dirty.lru_get_next_expire())) != 0 &&
- bh->last_write < cutoff) {
+ bh->last_write < cutoff &&
+ --max > 0) {
ldout(cct, 10) << "flusher flushing aged dirty bh " << *bh << dendl;
bh_write(bh);
}
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 6c0486ce801..d2c574d982e 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1338,7 +1338,7 @@ int Objecter::recalc_op_target(Op *op)
need_check_tiering = true;
}
- if (need_check_tiering) {
+ if (honor_cache_redirects && need_check_tiering) {
const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool);
if (pi) {
if (is_read && pi->has_read_tier())
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 880023ab37b..938c97a4f31 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -386,7 +386,6 @@ struct ObjectOperation {
pwatchers->push_back(ow);
}
}
- *prval = 0;
}
catch (buffer::error& e) {
if (prval)
@@ -424,8 +423,6 @@ struct ObjectOperation {
}
psnaps->seq = resp.seq;
}
- if (prval)
- *prval = 0;
}
catch (buffer::error& e) {
if (prval)
@@ -643,6 +640,42 @@ struct ObjectOperation {
out_handler[p] = h;
}
+ void undirty() {
+ add_op(CEPH_OSD_OP_UNDIRTY);
+ }
+
+ struct C_ObjectOperation_isdirty : public Context {
+ bufferlist bl;
+ bool *pisdirty;
+ int *prval;
+ C_ObjectOperation_isdirty(bool *p, int *r)
+ : pisdirty(p), prval(r) {}
+ void finish(int r) {
+ if (r < 0)
+ return;
+ try {
+ bufferlist::iterator p = bl.begin();
+ bool isdirty;
+ ::decode(isdirty, p);
+ if (pisdirty)
+ *pisdirty = isdirty;
+ } catch (buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ }
+ }
+ };
+
+ void is_dirty(bool *pisdirty, int *prval) {
+ add_op(CEPH_OSD_OP_ISDIRTY);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ C_ObjectOperation_isdirty *h =
+ new C_ObjectOperation_isdirty(pisdirty, prval);
+ out_bl[p] = &h->bl;
+ out_handler[p] = h;
+ }
+
void omap_get_header(bufferlist *bl, int *prval) {
add_op(CEPH_OSD_OP_OMAPGETHEADER);
unsigned p = ops.size() - 1;
@@ -785,6 +818,7 @@ class Objecter {
int global_op_flags; // flags which are applied to each IO op
bool keep_balanced_budget;
bool honor_osdmap_full;
+ bool honor_cache_redirects;
void maybe_request_map();
@@ -1260,6 +1294,7 @@ public:
num_unacked(0), num_uncommitted(0),
global_op_flags(0),
keep_balanced_budget(false), honor_osdmap_full(true),
+ honor_cache_redirects(true),
last_seen_osdmap_version(0),
last_seen_pgmap_version(0),
client_lock(l), timer(t),
@@ -1293,6 +1328,9 @@ public:
void set_honor_osdmap_full() { honor_osdmap_full = true; }
void unset_honor_osdmap_full() { honor_osdmap_full = false; }
+ void set_honor_cache_redirects() { honor_cache_redirects = true; }
+ void unset_honor_cache_redirects() { honor_cache_redirects = false; }
+
void scan_requests(bool skipped_map,
map<tid_t, Op*>& need_resend,
list<LingerOp*>& need_resend_linger,
diff --git a/src/pybind/ceph_argparse.py b/src/pybind/ceph_argparse.py
index 427a4621216..1f6e90b6c1d 100644
--- a/src/pybind/ceph_argparse.py
+++ b/src/pybind/ceph_argparse.py
@@ -275,12 +275,26 @@ class CephIPAddr(CephArgtype):
class CephEntityAddr(CephIPAddr):
"""
- EntityAddress, that is, IP address/nonce
+ EntityAddress, that is, IP address[/nonce]
"""
def valid(self, s, partial=False):
- ip, nonce = s.split('/')
+ nonce = None
+ if '/' in s:
+ ip, nonce = s.split('/')
+ else:
+ ip = s
super(self.__class__, self).valid(ip)
- self.nonce = nonce
+ if nonce:
+ nonce_long = None
+ try:
+ nonce_long = long(nonce)
+ except ValueError:
+ pass
+ if nonce_long is None or nonce_long < 0:
+ raise ArgumentValid(
+ '{0}: invalid entity, nonce {1} not integer > 0'.\
+ format(s, nonce)
+ )
self.val = s
def __str__(self):
@@ -829,6 +843,11 @@ def validate(args, signature, partial=False):
# wanted n, got too few
if partial:
return d
+ # special-case the "0 expected 1" case
+ if desc.numseen == 0 and desc.n == 1:
+ raise ArgumentNumber(
+ 'missing required parameter {0}'.format(desc)
+ )
raise ArgumentNumber(
'saw {0} of {1}, expected {2}'.\
format(desc.numseen, desc, desc.n)
@@ -937,6 +956,7 @@ def validate_command(sigdict, args, verbose=False):
# Stop now, because we have the right command but
# some other input is invalid
print >> sys.stderr, "Invalid command: ", str(e)
+ print >> sys.stderr, concise_sig(sig), ': ', cmd['help']
return {}
if found:
break
diff --git a/src/pybind/ceph_rest_api.py b/src/pybind/ceph_rest_api.py
index c53c3d77737..75e61060544 100755
--- a/src/pybind/ceph_rest_api.py
+++ b/src/pybind/ceph_rest_api.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
# vim: ts=4 sw=4 smarttab expandtab
import errno
diff --git a/src/rbd.cc b/src/rbd.cc
index eea9733c4b9..147eb2c5138 100644
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -68,6 +68,7 @@ static string dir_info_oid = RBD_INFO;
bool udevadm_settle = true;
bool progress = true;
bool resize_allow_shrink = false;
+bool read_only = false;
#define dout_subsys ceph_subsys_rbd
@@ -151,6 +152,7 @@ void usage()
" --pretty-format make json or xml output more readable\n"
" --no-settle do not wait for udevadm to settle on map/unmap\n"
" --no-progress do not show progress for long-running commands\n"
+" --read-only set device readonly when mapping image\n"
" --allow-shrink allow shrinking of an image when resizing\n";
}
@@ -1640,8 +1642,13 @@ static int do_kernel_add(const char *poolname, const char *imgname,
oss << ",";
}
+ if (read_only)
+ oss << " ro";
+ else
+ oss << " rw";
+
const char *user = g_conf->name.get_id().c_str();
- oss << " name=" << user;
+ oss << ",name=" << user;
char key_name[strlen(user) + strlen("client.") + 1];
snprintf(key_name, sizeof(key_name), "client.%s", user);
@@ -2200,6 +2207,8 @@ int main(int argc, const char **argv)
lock_tag = strdup(val.c_str());
} else if (ceph_argparse_flag(args, i, "--no-settle", (char *)NULL)) {
udevadm_settle = false;
+ } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
+ read_only = true;
} else if (ceph_argparse_flag(args, i, "--no-progress", (char *)NULL)) {
progress = false;
} else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
@@ -2247,7 +2256,7 @@ int main(int argc, const char **argv)
opt_cmd = get_cmd(*i, false, false);
}
if (opt_cmd == OPT_NO_CMD) {
- cerr << "rbd: error parsing command '" << *i << "'" << std::endl;
+ cerr << "rbd: error parsing command '" << *i << "'; -h or --help for usage" << std::endl;
return EXIT_FAILURE;
}
diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c
index eea6edb9eb8..2a6a8d22e81 100644
--- a/src/rbd_fuse/rbd-fuse.c
+++ b/src/rbd_fuse/rbd-fuse.c
@@ -1,7 +1,7 @@
/*
* rbd-fuse
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include "include/int_types.h"
diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc
index b22d5804569..23f73e26531 100644
--- a/src/rgw/rgw_metadata.cc
+++ b/src/rgw/rgw_metadata.cc
@@ -388,6 +388,8 @@ int RGWMetadataManager::remove(string& metadata_key)
objv_tracker.read_version = obj->get_version();
+ delete obj;
+
return handler->remove(store, entry, objv_tracker);
}
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 811c7ee57cc..9f0a900f3d3 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -902,7 +902,9 @@ int RGWRados::init_complete()
ret = region_map.read(cct, this);
if (ret < 0) {
- ldout(cct, 0) << "WARNING: cannot read region map" << dendl;
+ if (ret != -ENOENT) {
+ ldout(cct, 0) << "WARNING: cannot read region map" << dendl;
+ }
ret = region_map.update(region);
if (ret < 0) {
ldout(cct, 0) << "ERROR: failed to update regionmap with local region info" << dendl;
@@ -2189,8 +2191,8 @@ int RGWRados::create_pools(vector<string>& names, vector<int>& retcodes)
if (r < 0) {
ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
}
- c->release();
}
+ c->release();
retcodes.push_back(r);
}
return 0;
@@ -2647,7 +2649,6 @@ int RGWRados::copy_obj(void *ctx,
{ /* opening scope so that we can do goto, sorry */
bufferlist& extra_data_bl = processor.get_extra_data();
if (extra_data_bl.length()) {
- extra_data_bl.append((char)0);
JSONParser jp;
if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
diff --git a/src/script/perf-watch.py b/src/script/perf-watch.py
index 8c18c3ec766..826d4a499d7 100755
--- a/src/script/perf-watch.py
+++ b/src/script/perf-watch.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import json
import argparse
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 647aad3550d..59b4d89e930 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -65,6 +65,10 @@ endif
bin_PROGRAMS += ceph-dencoder
+get_command_descriptions_SOURCES = test/common/get_command_descriptions.cc
+get_command_descriptions_LDADD = $(LIBMON) $(LIBCOMMON) $(CEPH_GLOBAL)
+noinst_PROGRAMS += get_command_descriptions
+
## Build tests
# These should all use explicit _CXXFLAGS so avoid basename conflicts
@@ -228,6 +232,10 @@ bin_DEBUGPROGRAMS += ceph_bench_log
## Unit tests
+check_SCRIPTS += \
+ unittest_bufferlist.sh \
+ test/encoding/check-generated.sh
+
# target to build but not run the unit tests
unittests:: $(check_PROGRAMS)
@@ -250,11 +258,21 @@ unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
check_PROGRAMS += unittest_addrs
+unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc
+unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_bloom_filter
+
unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
check_PROGRAMS += unittest_sharedptr_registry
+unittest_sloppy_crc_map_SOURCES = test/common/test_sloppy_crc_map.cc
+unittest_sloppy_crc_map_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_sloppy_crc_map_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_sloppy_crc_map
+
unittest_util_SOURCES = test/common/test_util.cc
unittest_util_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_util_LDADD = $(LIBCOMMON) -lm $(UNITTEST_LDADD) $(CRYPTO_LIBS) $(EXTRALIBS)
@@ -300,6 +318,11 @@ unittest_ceph_argparse_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
unittest_ceph_argparse_CXXFLAGS = $(UNITTEST_CXXFLAGS)
check_PROGRAMS += unittest_ceph_argparse
+unittest_ceph_compatset_SOURCES = test/ceph_compatset.cc
+unittest_ceph_compatset_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_ceph_compatset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+check_PROGRAMS += unittest_ceph_compatset
+
libec_example_la_SOURCES = test/osd/ErasureCodePluginExample.cc
libec_example_la_CFLAGS = ${AM_CFLAGS}
libec_example_la_CXXFLAGS= ${AM_CXXFLAGS}
@@ -314,6 +337,13 @@ libec_missing_entry_point_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
libec_missing_entry_point_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
erasure_codelib_LTLIBRARIES += libec_missing_entry_point.la
+libec_hangs_la_SOURCES = test/osd/ErasureCodePluginHangs.cc
+libec_hangs_la_CFLAGS = ${AM_CFLAGS}
+libec_hangs_la_CXXFLAGS= ${AM_CXXFLAGS}
+libec_hangs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libec_hangs_la_LDFLAGS = ${AM_LDFLAGS} -export-symbols-regex '.*__erasure_code_.*'
+erasure_codelib_LTLIBRARIES += libec_hangs.la
+
libec_fail_to_initialize_la_SOURCES = test/osd/ErasureCodePluginFailToInitialize.cc
libec_fail_to_initialize_la_CFLAGS = ${AM_CFLAGS}
libec_fail_to_initialize_la_CXXFLAGS= ${AM_CXXFLAGS}
@@ -537,6 +567,8 @@ unittest_texttable_LDADD = $(LIBCOMMON) $(UNITTEST_LDADD)
unittest_texttable_CXXFLAGS = $(UNITTEST_CXXFLAGS)
check_PROGRAMS += unittest_texttable
+check_SCRIPTS += test/pybind/test_ceph_argparse.py
+
if WITH_RADOSGW
ceph_test_cors_SOURCES = test/test_cors.cc
ceph_test_cors_LDADD = \
diff --git a/src/test/ObjectMap/test_object_map.cc b/src/test/ObjectMap/test_object_map.cc
index 1b39c8068fb..23f220daf45 100644
--- a/src/test/ObjectMap/test_object_map.cc
+++ b/src/test/ObjectMap/test_object_map.cc
@@ -55,16 +55,16 @@ public:
}
void set_key(const string &objname, const string &key, const string &value) {
- set_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
void set_xattr(const string &objname, const string &key, const string &value) {
- set_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- void set_key(hobject_t hoid,
+ void set_key(ghobject_t hoid,
string key, string value) {
map<string, bufferlist> to_write;
bufferptr bp(value.c_str(), value.size());
@@ -74,7 +74,7 @@ public:
db->set_keys(hoid, to_write);
}
- void set_xattr(hobject_t hoid,
+ void set_xattr(ghobject_t hoid,
string key, string value) {
map<string, bufferlist> to_write;
bufferptr bp(value.c_str(), value.size());
@@ -85,11 +85,11 @@ public:
}
void set_header(const string &objname, const string &value) {
- set_header(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ set_header(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
value);
}
- void set_header(hobject_t hoid,
+ void set_header(ghobject_t hoid,
const string &value) {
bufferlist header;
header.append(bufferptr(value.c_str(), value.size() + 1));
@@ -97,11 +97,11 @@ public:
}
int get_header(const string &objname, string *value) {
- return get_header(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_header(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
value);
}
- int get_header(hobject_t hoid,
+ int get_header(ghobject_t hoid,
string *value) {
bufferlist header;
int r = db->get_header(hoid, &header);
@@ -115,11 +115,11 @@ public:
}
int get_xattr(const string &objname, const string &key, string *value) {
- return get_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- int get_xattr(hobject_t hoid,
+ int get_xattr(ghobject_t hoid,
string key, string *value) {
set<string> to_get;
to_get.insert(key);
@@ -135,11 +135,11 @@ public:
}
int get_key(const string &objname, const string &key, string *value) {
- return get_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ return get_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key, value);
}
- int get_key(hobject_t hoid,
+ int get_key(ghobject_t hoid,
string key, string *value) {
set<string> to_get;
to_get.insert(key);
@@ -155,11 +155,11 @@ public:
}
void remove_key(const string &objname, const string &key) {
- remove_key(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ remove_key(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key);
}
- void remove_key(hobject_t hoid,
+ void remove_key(ghobject_t hoid,
string key) {
set<string> to_remove;
to_remove.insert(key);
@@ -167,11 +167,11 @@ public:
}
void remove_xattr(const string &objname, const string &key) {
- remove_xattr(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
+ remove_xattr(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
key);
}
- void remove_xattr(hobject_t hoid,
+ void remove_xattr(ghobject_t hoid,
string key) {
set<string> to_remove;
to_remove.insert(key);
@@ -179,20 +179,20 @@ public:
}
void clone(const string &objname, const string &target) {
- clone(hobject_t(sobject_t(objname, CEPH_NOSNAP)),
- hobject_t(sobject_t(target, CEPH_NOSNAP)));
+ clone(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))),
+ ghobject_t(hobject_t(sobject_t(target, CEPH_NOSNAP))));
}
- void clone(hobject_t hoid,
- hobject_t hoid2) {
+ void clone(ghobject_t hoid,
+ ghobject_t hoid2) {
db->clone(hoid, hoid2);
}
void clear(const string &objname) {
- clear(hobject_t(sobject_t(objname, CEPH_NOSNAP)));
+ clear(ghobject_t(hobject_t(sobject_t(objname, CEPH_NOSNAP))));
}
- void clear(hobject_t hoid) {
+ void clear(ghobject_t hoid) {
db->clear(hoid);
}
@@ -543,7 +543,7 @@ int main(int argc, char **argv) {
}
TEST_F(ObjectMapTest, CreateOneObject) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)), 100, 0);
map<string, bufferlist> to_set;
string key("test");
string val("test_val");
@@ -579,8 +579,8 @@ TEST_F(ObjectMapTest, CreateOneObject) {
}
TEST_F(ObjectMapTest, CloneOneObject) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
- hobject_t hoid2(sobject_t("foo2", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)), 200, 0);
+ ghobject_t hoid2(hobject_t(sobject_t("foo2", CEPH_NOSNAP)), 201, 1);
tester.set_key(hoid, "foo", "bar");
tester.set_key(hoid, "foo2", "bar2");
@@ -640,8 +640,8 @@ TEST_F(ObjectMapTest, CloneOneObject) {
}
TEST_F(ObjectMapTest, OddEvenClone) {
- hobject_t hoid(sobject_t("foo", CEPH_NOSNAP));
- hobject_t hoid2(sobject_t("foo2", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ ghobject_t hoid2(hobject_t(sobject_t("foo2", CEPH_NOSNAP)));
for (unsigned i = 0; i < 1000; ++i) {
tester.set_key(hoid, "foo" + num_str(i), "bar" + num_str(i));
diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
index f81598ccfb8..8fcf3f30e82 100644
--- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc
+++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
@@ -24,6 +24,7 @@
#include "common/errno.h"
#include "common/safe_io.h"
#include "common/config.h"
+#include "common/strtol.h"
using namespace std;
@@ -38,7 +39,7 @@ class StoreTool
db.reset(db_ptr);
}
- void list(const string &prefix) {
+ void list(const string &prefix, const bool do_crc) {
KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
if (prefix.empty())
@@ -51,7 +52,11 @@ class StoreTool
if (!prefix.empty() && (rk.first != prefix))
break;
- std::cout << rk.first << ":" << rk.second << std::endl;
+ std::cout << rk.first << ":" << rk.second;
+ if (do_crc) {
+ std::cout << " (" << iter->value().crc32c(0) << ")";
+ }
+ std::cout << std::endl;
iter->next();
}
}
@@ -79,7 +84,7 @@ class StoreTool
assert(!prefix.empty() && !key.empty());
map<string,bufferlist> result;
- set<string> keys;
+ std::set<std::string> keys;
keys.insert(key);
db->get(prefix, keys, &result);
@@ -101,6 +106,18 @@ class StoreTool
std::cout << "total: " << s << std::endl;
return s;
}
+
+ bool set(const string &prefix, const string &key, bufferlist &val) {
+ assert(!prefix.empty());
+ assert(!key.empty());
+ assert(val.length() > 0);
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->set(prefix, key, val);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+ }
};
void usage(const char *pname)
@@ -109,10 +126,12 @@ void usage(const char *pname)
<< "\n"
<< "Commands:\n"
<< " list [prefix]\n"
+ << " list-crc [prefix]\n"
<< " exists <prefix> [key]\n"
<< " get <prefix> <key>\n"
- << " verify <store path>\n"
+ << " crc <prefix> <key>\n"
<< " get-size\n"
+ << " set <prefix> <key> [ver <N>|in <file>]\n"
<< std::endl;
}
@@ -140,12 +159,14 @@ int main(int argc, const char *argv[])
StoreTool st(path);
- if (cmd == "list") {
+ if (cmd == "list" || cmd == "list-crc") {
string prefix;
if (argc > 3)
prefix = argv[3];
- st.list(prefix);
+ bool do_crc = (cmd == "list-crc");
+
+ st.list(prefix, do_crc);
} else if (cmd == "exists") {
string key;
@@ -183,10 +204,63 @@ int main(int argc, const char *argv[])
bl.hexdump(os);
std::cout << os.str() << std::endl;
- } else if (cmd == "verify") {
- assert(0);
+ } else if (cmd == "crc") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << prefix << ", " << key << ") ";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << " crc " << bl.crc32c(0) << std::endl;
+
} else if (cmd == "get-size") {
std::cout << "estimated store size: " << st.get_size() << std::endl;
+
+ } else if (cmd == "set") {
+ if (argc < 7) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+ string subcmd(argv[5]);
+
+ bufferlist val;
+ string errstr;
+ if (subcmd == "ver") {
+ version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr);
+ if (!errstr.empty()) {
+ std::cerr << "error reading version: " << errstr << std::endl;
+ return 1;
+ }
+ ::encode(v, val);
+ } else if (subcmd == "in") {
+ int ret = val.read_file(argv[6], &errstr);
+ if (ret < 0 || !errstr.empty()) {
+ std::cerr << "error reading file: " << errstr << std::endl;
+ return 1;
+ }
+ } else {
+ std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool ret = st.set(prefix, key, val);
+ if (!ret) {
+ std::cerr << "error setting ("
+ << prefix << "," << key << ")" << std::endl;
+ return 1;
+ }
+
} else {
std::cerr << "Unrecognized command: " << cmd << std::endl;
return 1;
diff --git a/src/test/ceph_compatset.cc b/src/test/ceph_compatset.cc
new file mode 100644
index 00000000000..2b57db01ab9
--- /dev/null
+++ b/src/test/ceph_compatset.cc
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <fstream>
+#include <iostream>
+#include <errno.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <ctype.h>
+#include <boost/scoped_ptr.hpp>
+#include <string>
+
+#include "include/types.h"
+#include "include/compat.h"
+
+//#undef assert
+//#define assert(foo) if (!(foo)) abort();
+
+#include "include/CompatSet.h"
+
+#include "gtest/gtest.h"
+#include <vector>
+
+TEST(CephCompatSet, AllSet) {
+ CompatSet::FeatureSet compat;
+ CompatSet::FeatureSet ro;
+ CompatSet::FeatureSet incompat;
+
+ EXPECT_THROW(compat.insert(CompatSet::Feature(0, "test")), FailedAssertion);
+ EXPECT_THROW(compat.insert(CompatSet::Feature(64, "test")), FailedAssertion);
+
+ for (int i = 1; i < 64; i++) {
+ stringstream cname;
+ cname << string("c") << i;
+ compat.insert(CompatSet::Feature(i,cname.str().c_str()));
+ stringstream roname;
+ roname << string("r") << i;
+ ro.insert(CompatSet::Feature(i,roname.str().c_str()));
+ stringstream iname;
+ iname << string("i") << i;
+ incompat.insert(CompatSet::Feature(i,iname.str().c_str()));
+ }
+ CompatSet tcs(compat, ro, incompat);
+
+ //cout << tcs << std::endl;
+
+ //Due to a workaround for a bug bit 0 is always set even though it is
+ //not a legal feature.
+ EXPECT_EQ(tcs.compat.mask, (uint64_t)0xffffffffffffffff);
+ EXPECT_EQ(tcs.ro_compat.mask, (uint64_t)0xffffffffffffffff);
+ EXPECT_EQ(tcs.incompat.mask, (uint64_t)0xffffffffffffffff);
+
+ for (int i = 1; i < 64; i++) {
+ EXPECT_TRUE(tcs.compat.contains(i));
+ stringstream cname;
+ cname << string("c") << i;
+ EXPECT_TRUE(tcs.compat.contains(CompatSet::Feature(i,cname.str().c_str())));
+ tcs.compat.remove(i);
+
+ EXPECT_TRUE(tcs.ro_compat.contains(i));
+ stringstream roname;
+ roname << string("r") << i;
+ EXPECT_TRUE(tcs.ro_compat.contains(CompatSet::Feature(i,roname.str().c_str())));
+ tcs.ro_compat.remove(i);
+
+ EXPECT_TRUE(tcs.incompat.contains(i));
+ stringstream iname;
+ iname << string("i") << i;
+ EXPECT_TRUE(tcs.incompat.contains(CompatSet::Feature(i,iname.str().c_str())));
+ tcs.incompat.remove(i);
+ }
+ //Due to a workaround for a bug bit 0 is always set even though it is
+ //not a legal feature.
+ EXPECT_EQ(tcs.compat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.compat.names.empty());
+ EXPECT_EQ(tcs.ro_compat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.ro_compat.names.empty());
+ EXPECT_EQ(tcs.incompat.mask, (uint64_t)1);
+ EXPECT_TRUE(tcs.incompat.names.empty());
+}
+
+TEST(CephCompatSet, other) {
+ CompatSet s1, s2, s1dup;
+
+ s1.compat.insert(CompatSet::Feature(1, "c1"));
+ s1.compat.insert(CompatSet::Feature(2, "c2"));
+ s1.compat.insert(CompatSet::Feature(32, "c32"));
+ s1.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s1.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s2.compat.insert(CompatSet::Feature(1, "c1"));
+ s2.compat.insert(CompatSet::Feature(32, "c32"));
+ s2.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s2.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s1dup = s1;
+
+ //Check exact match
+ EXPECT_EQ(s1.compare(s1dup), 0);
+
+ //Check superset
+ EXPECT_EQ(s1.compare(s2), 1);
+
+ //Check missing features
+ EXPECT_EQ(s2.compare(s1), -1);
+
+ CompatSet diff = s2.unsupported(s1);
+ EXPECT_EQ(diff.compat.mask, (uint64_t)1<<2 | 1);
+ EXPECT_EQ(diff.ro_compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.incompat.mask, (uint64_t)1);
+
+ CompatSet s3 = s1;
+ s3.incompat.insert(CompatSet::Feature(4, "i4"));
+
+ diff = s1.unsupported(s3);
+ EXPECT_EQ(diff.compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.ro_compat.mask, (uint64_t)1);
+ EXPECT_EQ(diff.incompat.mask, (uint64_t)1<<4 | 1);
+}
+
+TEST(CephCompatSet, merge) {
+ CompatSet s1, s2, s1dup, s2dup;
+
+ s1.compat.insert(CompatSet::Feature(1, "c1"));
+ s1.compat.insert(CompatSet::Feature(2, "c2"));
+ s1.compat.insert(CompatSet::Feature(32, "c32"));
+ s1.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s1.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s1dup = s1;
+
+ s2.compat.insert(CompatSet::Feature(1, "c1"));
+ s2.compat.insert(CompatSet::Feature(32, "c32"));
+ s2.ro_compat.insert(CompatSet::Feature(1, "r1"));
+ s2.ro_compat.insert(CompatSet::Feature(63, "r63"));
+ s2.incompat.insert(CompatSet::Feature(1, "i1"));
+
+ s2dup = s2;
+
+ //Nothing to merge if they are the same
+ EXPECT_FALSE(s1.merge(s1dup));
+ EXPECT_FALSE(s2.merge(s2dup));
+
+ EXPECT_TRUE(s1.merge(s2));
+ EXPECT_EQ(s1.compat.mask, (uint64_t)1<<1 | (uint64_t)1<<2 | (uint64_t)1<<32 | 1);
+ EXPECT_EQ(s1.ro_compat.mask, (uint64_t)1<<1 | (uint64_t)1<<63 | 1);
+ EXPECT_EQ(s1.incompat.mask, (uint64_t)1<<1 | 1);
+
+ EXPECT_TRUE(s2.merge(s1dup));
+ EXPECT_EQ(s2.compat.mask, (uint64_t)1<<1 | (uint64_t)1<<2 | (uint64_t)1<<32 | 1);
+ EXPECT_EQ(s2.ro_compat.mask, (uint64_t)1<<1 | (uint64_t)1<<63 | 1);
+ EXPECT_EQ(s2.incompat.mask, (uint64_t)1<<1 | 1);
+}
diff --git a/src/test/cli-integration/rbd/formatted-output.t b/src/test/cli-integration/rbd/formatted-output.t
index bece14f11f1..707e0749367 100644
--- a/src/test/cli-integration/rbd/formatted-output.t
+++ b/src/test/cli-integration/rbd/formatted-output.t
@@ -39,7 +39,7 @@ For now, use a more inclusive regex.
$ rbd info foo
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info foo --format json | python -mjson.tool
@@ -67,7 +67,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info foo@snap
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
\tprotected: False (esc)
@@ -96,7 +96,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -131,7 +131,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap
rbd image 'bar':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -169,7 +169,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap2
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -207,7 +207,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info baz
rbd image 'baz':
\tsize 2048 MB in 512 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -241,8 +241,8 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
</image>
$ rbd info quux
rbd image 'quux':
- \tsize 1024 KB in 1 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \tsize 1024 kB in 1 objects (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info quux --format json | python -mjson.tool
@@ -268,7 +268,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -303,7 +303,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child@snap
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -375,7 +375,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
NAME SIZE PARENT FMT PROT LOCK
foo 1024M 1
foo@snap 1024M 1
- quux 1024K 1 excl
+ quux 1024k 1 excl
bar 1024M 2
bar@snap 512M 2 yes
bar@snap2 1024M 2
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 417c04ee777..4fe30b1cda7 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -125,9 +125,10 @@
--max-size specify max size (in bytes)
--quota-scope scope of quota (bucket, user)
- --conf/-c Read configuration from the given configuration file
- --id/-i set ID portion of my name
- --name/-n set name (TYPE.ID)
- --version show version and quit
+ --conf/-c FILE read configuration from the given configuration file
+ --id/-i ID set ID portion of my name
+ --name/-n TYPE.ID set name
+ --cluster NAME set cluster name (default: ceph)
+ --version show version and quit
[1]
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index 1ad79385a7e..754e11f9357 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -76,4 +76,5 @@
--pretty-format make json or xml output more readable
--no-settle do not wait for udevadm to settle on map/unmap
--no-progress do not show progress for long-running commands
+ --read-only set device readonly when mapping image
--allow-shrink allow shrinking of an image when resizing
diff --git a/src/test/common/get_command_descriptions.cc b/src/test/common/get_command_descriptions.cc
new file mode 100644
index 00000000000..aff5575b8c4
--- /dev/null
+++ b/src/test/common/get_command_descriptions.cc
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <signal.h>
+#include "mon/Monitor.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_mon
+
+static void usage(ostream &out)
+{
+ out << "usage: get_command_descriptions [options ...]" << std::endl;
+ out << "print on stdout the result of JSON formatted options\n";
+ out << "found in mon/MonCommands.h as produced by the\n";
+ out << "Monitor.cc::get_command_descriptions function.\n";
+ out << "Designed as a helper for ceph_argparse.py unit tests.\n";
+ out << "\n";
+ out << " --all all of mon/MonCommands.h \n";
+ out << " --pull585 reproduce the bug fixed by #585\n";
+ out << "\n";
+ out << "Examples:\n";
+ out << " get_command_descriptions --all\n";
+ out << " get_command_descriptions --pull585\n";
+}
+
+static void json_print(const MonCommand *mon_commands, int size)
+{
+ bufferlist rdata;
+ Formatter *f = new_formatter("json");
+ get_command_descriptions(mon_commands, size, f, &rdata);
+ delete f;
+ string data(rdata.c_str(), rdata.length());
+ dout(0) << data << dendl;
+}
+
+static void all()
+{
+#undef COMMAND
+ MonCommand mon_commands[] = {
+#define COMMAND(parsesig, helptext, modulename, req_perms, avail) \
+ {parsesig, helptext, modulename, req_perms, avail},
+#include <mon/MonCommands.h>
+ };
+
+ json_print(mon_commands, ARRAY_SIZE(mon_commands));
+}
+
+// syntax error https://github.com/ceph/ceph/pull/585
+static void pull585()
+{
+ MonCommand mon_commands[] = {
+ { "osd pool create "
+ "name=pool,type=CephPoolname "
+ "name=pg_num,type=CephInt,range=0 "
+ "name=pgp_num,type=CephInt,range=0,req=false" // !!! missing trailing space
+ "name=properties,type=CephString,n=N,req=false,goodchars=[A-Za-z0-9-_.=]",
+ "create pool", "osd", "rw", "cli,rest" }
+ };
+
+ json_print(mon_commands, ARRAY_SIZE(mon_commands));
+}
+
+int main(int argc, char **argv) {
+ vector<const char*> args;
+ argv_to_vec(argc, (const char **)argv, args);
+
+ global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ if (args.empty()) {
+ usage(cerr);
+ exit(1);
+ }
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ++i) {
+ string err;
+
+ if (*i == string("help") || *i == string("-h") || *i == string("--help")) {
+ usage(cout);
+ exit(0);
+ } else if (*i == string("--all")) {
+ all();
+ } else if (*i == string("--pull585")) {
+ pull585();
+ }
+ }
+}
+
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ * make get_command_descriptions &&
+ * ./get_command_descriptions --all --pull585"
+ * End:
+ */
+
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
new file mode 100644
index 00000000000..cfd41305caa
--- /dev/null
+++ b/src/test/common/test_bloom_filter.cc
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "include/stringify.h"
+#include "common/bloom_filter.hpp"
+
+TEST(BloomFilter, Basic) {
+ bloom_filter bf(10, .1, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+}
+
+TEST(BloomFilter, Empty) {
+ bloom_filter bf;
+ for (int i=0; i<100; ++i) {
+ ASSERT_FALSE(bf.contains(i));
+ ASSERT_FALSE(bf.contains(stringify(i)));
+ }
+}
+
+TEST(BloomFilter, Sweep) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ for (int n = 0; n < max; n++)
+ bf.insert("ok" + stringify(n));
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains("asdf" + stringify(n)))
+ hit++;
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+
+ }
+ }
+}
+
+TEST(BloomFilter, SweepInt) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(123);
+ ASSERT_TRUE(456);
+
+ for (int n = 0; n < max; n++)
+ bf.insert(n);
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains(100000 + n))
+ hit++;
+
+ ASSERT_TRUE(123);
+ ASSERT_TRUE(456);
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert
+ << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+ ASSERT_TRUE(actual > fpp / 10);
+ ASSERT_TRUE(bf.density() > 0.40);
+ ASSERT_TRUE(bf.density() < 0.60);
+ }
+ }
+}
+
+
+TEST(BloomFilter, CompressibleSweep) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
+ std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n";
+ float fpp = .01;
+ int max = 1024;
+ for (int div = 1; div < 10; div++) {
+ compressible_bloom_filter bf(max, fpp, 1);
+ int t = max/div;
+ for (int n = 0; n < t; n++)
+ bf.insert(n);
+
+ unsigned est = bf.approx_unique_element_count();
+ if (div > 1)
+ bf.compress(1.0 / div);
+
+ for (int n = 0; n < t; n++)
+ ASSERT_TRUE(bf.contains(n));
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains(100000 + n))
+ hit++;
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+ unsigned est_after = bf.approx_unique_element_count();
+ std::cout << max
+ << "\t" << t
+ << "\t" << est
+ << "\t" << est_after
+ << "\t" << fpp
+ << "\t" << actual
+ << "\t" << bl.length() << "\t" << byte_per_insert
+ << std::endl;
+
+ ASSERT_TRUE(actual < fpp * 2.0);
+ ASSERT_TRUE(actual > fpp / 2.0);
+ ASSERT_TRUE(est_after < est * 2);
+ ASSERT_TRUE(est_after > est / 2);
+ }
+}
+
+
+
+TEST(BloomFilter, BinSweep) {
+ std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
+ std::cout.precision(5);
+ int total_max = 16384;
+ float total_fpp = .01;
+ std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
+ for (int bins = 1; bins < 16; ++bins) {
+ int max = total_max / bins;
+ float fpp = total_fpp / bins;//pow(total_fpp, bins);
+
+ std::vector<bloom_filter*> ls;
+ bufferlist bl;
+ for (int i=0; i<bins; i++) {
+ ls.push_back(new bloom_filter(max, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert(10000 * (i+1) + j);
+ }
+ ::encode(*ls.front(), bl);
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!!
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp
+ << " actual-fpp " << actual
+ << " total-size " << bl.length() << std::endl;
+ }
+}
+
+// disable these tests; doing dual insertions in consecutive filters
+// appears to be equivalent to doing a single insertion in a bloom
+// filter that is twice as big.
+#if 0
+
+// test the fpp over a sequence of bloom filters, each with unique
+// items inserted into it.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp
+TEST(BloomFilter, Sequence) {
+
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl;
+ }
+}
+
+// test the ffp over a sequence of bloom filters, where actual values
+// are always inserted into a consecutive pair of filters. in order
+// to have a false positive, we need to falsely match two consecutive
+// filters.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp^2
+TEST(BloomFilter, SequenceDouble) {
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ int run = 0;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ run++;
+ if (run >= 2) {
+ hit++;
+ break;
+ }
+ } else {
+ run = 0;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual
+ << " expected " << (fpp*fpp*(double)seq) << std::endl;
+ }
+}
+
+#endif
diff --git a/src/test/common/test_sloppy_crc_map.cc b/src/test/common/test_sloppy_crc_map.cc
new file mode 100644
index 00000000000..2650f4f960d
--- /dev/null
+++ b/src/test/common/test_sloppy_crc_map.cc
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+#include <gtest/gtest.h>
+
+void dump(const SloppyCRCMap& scm)
+{
+ Formatter *f = new_formatter("json-pretty");
+ f->open_object_section("map");
+ scm.dump(f);
+ f->close_section();
+ f->flush(cout);
+ delete f;
+}
+
+TEST(SloppyCRCMap, basic) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("The quick brown fox jumped over a fence whose color I forget.");
+ b.append("asdf");
+
+ scm.write(0, a.length(), a);
+ if (0)
+ dump(scm);
+ ASSERT_EQ(0, scm.read(0, a.length(), a, &cout));
+
+ scm.write(12, b.length(), b);
+ if (0)
+ dump(scm);
+
+ ASSERT_EQ(0, scm.read(12, b.length(), b, &cout));
+ ASSERT_EQ(1, scm.read(0, a.length(), a, &cout));
+}
+
+TEST(SloppyCRCMap, truncate) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("asdf");
+ b.append("qwer");
+
+ scm.write(0, a.length(), a);
+ scm.write(4, a.length(), a);
+ ASSERT_EQ(0, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+ scm.truncate(4);
+ ASSERT_EQ(0, scm.read(4, 4, b, &cout));
+}
+
+TEST(SloppyCRCMap, zero) {
+ SloppyCRCMap scm(4);
+
+ bufferlist a, b;
+ a.append("asdf");
+ b.append("qwer");
+
+ scm.write(0, a.length(), a);
+ scm.write(4, a.length(), a);
+ ASSERT_EQ(0, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+ scm.zero(4, 4);
+ ASSERT_EQ(1, scm.read(4, 4, a, &cout));
+ ASSERT_EQ(1, scm.read(4, 4, b, &cout));
+
+ bufferptr bp(4);
+ bp.zero();
+ bufferlist c;
+ c.append(bp);
+ ASSERT_EQ(0, scm.read(0, 4, a, &cout));
+ ASSERT_EQ(0, scm.read(4, 4, c, &cout));
+ scm.zero(0, 15);
+ ASSERT_EQ(1, scm.read(0, 4, a, &cout));
+ ASSERT_EQ(0, scm.read(0, 4, c, &cout));
+}
+
+TEST(SloppyCRCMap, clone_range) {
+ SloppyCRCMap src(4);
+ SloppyCRCMap dst(4);
+
+ bufferlist a, b;
+ a.append("asdfghjkl");
+ b.append("qwertyui");
+
+ src.write(0, a.length(), a);
+ src.write(8, a.length(), a);
+ src.write(16, a.length(), a);
+
+ dst.write(0, b.length(), b);
+ dst.clone_range(0, 8, 0, src);
+ ASSERT_EQ(2, dst.read(0, 8, b, &cout));
+ ASSERT_EQ(0, dst.read(8, 8, b, &cout));
+
+ dst.write(16, b.length(), b);
+ ASSERT_EQ(2, dst.read(16, 8, a, &cout));
+ dst.clone_range(16, 8, 16, src);
+ ASSERT_EQ(0, dst.read(16, 8, a, &cout));
+
+ dst.write(16, b.length(), b);
+ ASSERT_EQ(1, dst.read(16, 4, a, &cout));
+ dst.clone_range(16, 8, 2, src);
+ ASSERT_EQ(0, dst.read(16, 4, a, &cout));
+
+ dst.write(0, b.length(), b);
+ dst.write(8, b.length(), b);
+ ASSERT_EQ(2, dst.read(0, 8, a, &cout));
+ ASSERT_EQ(2, dst.read(8, 8, a, &cout));
+ dst.clone_range(2, 8, 0, src);
+ ASSERT_EQ(0, dst.read(0, 8, a, &cout));
+ ASSERT_EQ(0, dst.read(8, 4, a, &cout));
+}
diff --git a/src/test/common/test_util.cc b/src/test/common/test_util.cc
index 16713077cfc..cb22047c600 100644
--- a/src/test/common/test_util.cc
+++ b/src/test/common/test_util.cc
@@ -21,6 +21,7 @@ TEST(util, unit_to_bytesize)
{
ASSERT_EQ(1234ll, unit_to_bytesize("1234", &cerr));
ASSERT_EQ(1024ll, unit_to_bytesize("1K", &cerr));
+ ASSERT_EQ(1024ll, unit_to_bytesize("1k", &cerr));
ASSERT_EQ(1048576ll, unit_to_bytesize("1M", &cerr));
ASSERT_EQ(1073741824ll, unit_to_bytesize("1G", &cerr));
ASSERT_EQ(1099511627776ll, unit_to_bytesize("1T", &cerr));
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index fe17f077d8e..18ed795c3ef 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -4,6 +4,10 @@ TYPE(CompatSet)
#include "include/filepath.h"
TYPE(filepath)
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+TYPE(compressible_bloom_filter)
+
#include "common/snap_types.h"
TYPE(SnapContext)
TYPE(SnapRealmInfo)
@@ -16,6 +20,9 @@ TYPE(LogEntryKey)
TYPE(LogEntry)
TYPE(LogSummary)
+#include "common/SloppyCRCMap.h"
+TYPE(SloppyCRCMap)
+
#include "msg/msg_types.h"
TYPE(entity_name_t)
TYPE(entity_addr_t)
@@ -29,13 +36,15 @@ TYPEWITHSTRAYDATA(OSDMap::Incremental)
#include "crush/CrushWrapper.h"
TYPE(CrushWrapper)
+#include "include/histogram.h"
+TYPE(pow2_hist_t)
+
#include "osd/osd_types.h"
TYPE(osd_reqid_t)
TYPE(object_locator_t)
TYPE(request_redirect_t)
TYPE(pg_t)
TYPE(coll_t)
-TYPE(pow2_hist_t)
TYPE(filestore_perf_stat_t)
TYPE(osd_stat_t)
TYPE(OSDSuperblock)
@@ -78,6 +87,7 @@ TYPE(SequencerPosition)
#include "common/hobject.h"
TYPE(hobject_t)
+TYPE(ghobject_t)
#include "mon/AuthMonitor.h"
TYPE(AuthMonitor::Incremental)
diff --git a/src/test/filestore/FileStoreDiff.cc b/src/test/filestore/FileStoreDiff.cc
index b2419f5e298..40c0b32d30c 100644
--- a/src/test/filestore/FileStoreDiff.cc
+++ b/src/test/filestore/FileStoreDiff.cc
@@ -131,7 +131,7 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
bool ret = false;
int err;
- std::vector<hobject_t> b_objects, a_objects;
+ std::vector<ghobject_t> b_objects, a_objects;
err = b_store->collection_list(coll, b_objects);
if (err < 0) {
dout(0) << "diff_objects list on verify coll " << coll.to_str()
@@ -151,11 +151,11 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
ret = true;
}
- std::vector<hobject_t>::iterator b_it = b_objects.begin();
- std::vector<hobject_t>::iterator a_it = b_objects.begin();
+ std::vector<ghobject_t>::iterator b_it = b_objects.begin();
+ std::vector<ghobject_t>::iterator a_it = b_objects.begin();
for (; b_it != b_objects.end(); ++b_it, ++a_it) {
- hobject_t b_obj = *b_it, a_obj = *a_it;
- if (b_obj.oid.name != a_obj.oid.name) {
+ ghobject_t b_obj = *b_it, a_obj = *a_it;
+ if (b_obj.hobj.oid.name != a_obj.hobj.oid.name) {
dout(0) << "diff_objects name mismatch on A object "
<< coll << "/" << a_obj << " and B object "
<< coll << "/" << b_obj << dendl;
@@ -167,7 +167,7 @@ bool FileStoreDiff::diff_objects(FileStore *a_store, FileStore *b_store, coll_t
err = b_store->stat(coll, b_obj, &b_stat);
if (err < 0) {
dout(0) << "diff_objects error stating B object "
- << coll.to_str() << "/" << b_obj.oid.name << dendl;
+ << coll.to_str() << "/" << b_obj.hobj.oid.name << dendl;
ret = true;
}
err = a_store->stat(coll, a_obj, &a_stat);
diff --git a/src/test/filestore/run_seed_to_range.sh b/src/test/filestore/run_seed_to_range.sh
index c5b399d7aae..365b34918d2 100755
--- a/src/test/filestore/run_seed_to_range.sh
+++ b/src/test/filestore/run_seed_to_range.sh
@@ -12,7 +12,7 @@ mydir=`dirname $0`
for f in `seq $from $to`
do
if ! $mydir/run_seed_to.sh $seed $f; then
- if -d $dir; then
+ if [ -d $dir ]; then
echo copying evidence to $dir
cp -a . $dir
else
diff --git a/src/test/filestore/store_test.cc b/src/test/filestore/store_test.cc
index 92104960127..50450f467ff 100644
--- a/src/test/filestore/store_test.cc
+++ b/src/test/filestore/store_test.cc
@@ -51,9 +51,9 @@ public:
}
};
-bool sorted(const vector<hobject_t> &in) {
- hobject_t start;
- for (vector<hobject_t>::const_iterator i = in.begin();
+bool sorted(const vector<ghobject_t> &in) {
+ ghobject_t start;
+ for (vector<ghobject_t>::const_iterator i = in.begin();
i != in.end();
++i) {
if (start > *i) return false;
@@ -105,7 +105,7 @@ TEST_F(StoreTest, SimpleObjectTest) {
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- hobject_t hoid(sobject_t("Object 1", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -133,7 +133,7 @@ TEST_F(StoreTest, SimpleObjectLongnameTest) {
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- hobject_t hoid(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaObjectaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 1", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -157,7 +157,7 @@ TEST_F(StoreTest, ManyObjectTest) {
coll_t cid("blah");
string base = "";
for (int i = 0; i < 100; ++i) base.append("aaaaa");
- set<hobject_t> created;
+ set<ghobject_t> created;
{
ObjectStore::Transaction t;
t.create_collection(cid);
@@ -171,27 +171,27 @@ TEST_F(StoreTest, ManyObjectTest) {
ObjectStore::Transaction t;
char buf[100];
snprintf(buf, sizeof(buf), "%d", i);
- hobject_t hoid(sobject_t(string(buf) + base, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(string(buf) + base, CEPH_NOSNAP)));
t.touch(cid, hoid);
created.insert(hoid);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
struct stat buf;
ASSERT_TRUE(!store->stat(cid, *i, &buf));
}
- set<hobject_t> listed;
- vector<hobject_t> objects;
+ set<ghobject_t> listed;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
cerr << "objects.size() is " << objects.size() << std::endl;
- for (vector<hobject_t> ::iterator i = objects.begin();
+ for (vector<ghobject_t> ::iterator i = objects.begin();
i != objects.end();
++i) {
listed.insert(*i);
@@ -199,11 +199,11 @@ TEST_F(StoreTest, ManyObjectTest) {
}
ASSERT_TRUE(listed.size() == created.size());
- hobject_t start, next;
+ ghobject_t start, next;
objects.clear();
r = store->collection_list_partial(
cid,
- hobject_t::get_max(),
+ ghobject_t::get_max(),
50,
60,
0,
@@ -234,13 +234,13 @@ TEST_F(StoreTest, ManyObjectTest) {
}
cerr << "listed.size() is " << listed.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
- for (set<hobject_t>::iterator i = listed.begin();
+ for (set<ghobject_t>::iterator i = listed.begin();
i != listed.end();
++i) {
ASSERT_TRUE(created.count(*i));
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
ObjectStore::Transaction t;
@@ -259,7 +259,7 @@ TEST_F(StoreTest, ManyObjectTest) {
class ObjectGenerator {
public:
- virtual hobject_t create_object(gen_type *gen) = 0;
+ virtual ghobject_t create_object(gen_type *gen) = 0;
virtual ~ObjectGenerator() {}
};
@@ -267,7 +267,7 @@ class MixedGenerator : public ObjectGenerator {
public:
unsigned seq;
MixedGenerator() : seq(0) {}
- hobject_t create_object(gen_type *gen) {
+ ghobject_t create_object(gen_type *gen) {
char buf[100];
snprintf(buf, sizeof(buf), "%u", seq);
@@ -283,7 +283,7 @@ public:
// hash
//boost::binomial_distribution<uint32_t> bin(0xFFFFFF, 0.5);
++seq;
- return hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, "");
+ return ghobject_t(hobject_t(name, string(), rand() & 2 ? CEPH_NOSNAP : rand(), rand() & 0xFF, 0, ""));
}
};
@@ -293,8 +293,8 @@ public:
static const unsigned max_objects = 3000;
coll_t cid;
unsigned in_flight;
- set<hobject_t> available_objects;
- set<hobject_t> in_use_objects;
+ set<ghobject_t> available_objects;
+ set<ghobject_t> in_use_objects;
ObjectGenerator *object_gen;
gen_type *rng;
ObjectStore *store;
@@ -307,9 +307,9 @@ public:
public:
SyntheticWorkloadState *state;
ObjectStore::Transaction *t;
- hobject_t hoid;
+ ghobject_t hoid;
C_SyntheticOnReadable(SyntheticWorkloadState *state,
- ObjectStore::Transaction *t, hobject_t hoid)
+ ObjectStore::Transaction *t, ghobject_t hoid)
: state(state), t(t), hoid(hoid) {}
void finish(int r) {
@@ -339,14 +339,14 @@ public:
return store->apply_transaction(t);
}
- hobject_t get_uniform_random_object() {
+ ghobject_t get_uniform_random_object() {
while (in_flight >= max_in_flight || available_objects.empty())
cond.Wait(lock);
boost::uniform_int<> choose(0, available_objects.size() - 1);
int index = choose(*rng);
- set<hobject_t>::iterator i = available_objects.begin();
+ set<ghobject_t>::iterator i = available_objects.begin();
for ( ; index > 0; --index, ++i) ;
- hobject_t ret = *i;
+ ghobject_t ret = *i;
available_objects.erase(i);
return ret;
}
@@ -375,7 +375,7 @@ public:
if (!can_create())
return -ENOSPC;
wait_for_ready();
- hobject_t new_obj = object_gen->create_object(rng);
+ ghobject_t new_obj = object_gen->create_object(rng);
in_use_objects.insert(new_obj);
available_objects.erase(new_obj);
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -388,9 +388,9 @@ public:
Mutex::Locker locker(lock);
while (in_flight)
cond.Wait(lock);
- vector<hobject_t> objects;
- set<hobject_t> objects_set, objects_set2;
- hobject_t next, current;
+ vector<ghobject_t> objects;
+ set<ghobject_t> objects_set, objects_set2;
+ ghobject_t next, current;
while (1) {
cerr << "scanning..." << std::endl;
int r = store->collection_list_partial(cid, current, 50, 100,
@@ -403,7 +403,7 @@ public:
current = next;
}
ASSERT_EQ(objects_set.size(), available_objects.size());
- for (set<hobject_t>::iterator i = objects_set.begin();
+ for (set<ghobject_t>::iterator i = objects_set.begin();
i != objects_set.end();
++i) {
ASSERT_GT(available_objects.count(*i), (unsigned)0);
@@ -413,7 +413,7 @@ public:
ASSERT_EQ(r, 0);
objects_set2.insert(objects.begin(), objects.end());
ASSERT_EQ(objects_set2.size(), available_objects.size());
- for (set<hobject_t>::iterator i = objects_set2.begin();
+ for (set<ghobject_t>::iterator i = objects_set2.begin();
i != objects_set2.end();
++i) {
ASSERT_GT(available_objects.count(*i), (unsigned)0);
@@ -421,7 +421,7 @@ public:
}
int stat() {
- hobject_t hoid;
+ ghobject_t hoid;
{
Mutex::Locker locker(lock);
if (!can_unlink())
@@ -446,7 +446,7 @@ public:
Mutex::Locker locker(lock);
if (!can_unlink())
return -ENOENT;
- hobject_t to_remove = get_uniform_random_object();
+ ghobject_t to_remove = get_uniform_random_object();
ObjectStore::Transaction *t = new ObjectStore::Transaction;
t->remove(cid, to_remove);
++in_flight;
@@ -505,7 +505,7 @@ TEST_F(StoreTest, HashCollisionTest) {
}
string base = "";
for (int i = 0; i < 100; ++i) base.append("aaaaa");
- set<hobject_t> created;
+ set<ghobject_t> created;
for (int n = 0; n < 10; ++n) {
char nbuf[100];
sprintf(nbuf, "n%d", n);
@@ -515,7 +515,7 @@ TEST_F(StoreTest, HashCollisionTest) {
if (!(i % 5)) {
cerr << "Object n" << n << " "<< i << std::endl;
}
- hobject_t hoid(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf));
+ ghobject_t hoid(hobject_t(string(buf) + base, string(), CEPH_NOSNAP, 0, 0, string(nbuf)));
{
ObjectStore::Transaction t;
t.touch(cid, hoid);
@@ -525,21 +525,21 @@ TEST_F(StoreTest, HashCollisionTest) {
created.insert(hoid);
}
}
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
- set<hobject_t> listed(objects.begin(), objects.end());
+ set<ghobject_t> listed(objects.begin(), objects.end());
cerr << "listed.size() is " << listed.size() << " and created.size() is " << created.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
objects.clear();
listed.clear();
- hobject_t current, next;
+ ghobject_t current, next;
while (1) {
r = store->collection_list_partial(cid, current, 50, 60,
0, &objects, &next);
ASSERT_EQ(r, 0);
ASSERT_TRUE(sorted(objects));
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
if (listed.count(*i))
@@ -555,13 +555,13 @@ TEST_F(StoreTest, HashCollisionTest) {
}
cerr << "listed.size() is " << listed.size() << std::endl;
ASSERT_TRUE(listed.size() == created.size());
- for (set<hobject_t>::iterator i = listed.begin();
+ for (set<ghobject_t>::iterator i = listed.begin();
i != listed.end();
++i) {
ASSERT_TRUE(created.count(*i));
}
- for (set<hobject_t>::iterator i = created.begin();
+ for (set<ghobject_t>::iterator i = created.begin();
i != created.end();
++i) {
ObjectStore::Transaction t;
@@ -576,7 +576,7 @@ TEST_F(StoreTest, HashCollisionTest) {
TEST_F(StoreTest, OMapTest) {
coll_t cid("blah");
- hobject_t hoid("tesomap", "", CEPH_NOSNAP, 0, 0, "");
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
int r;
{
ObjectStore::Transaction t;
@@ -672,7 +672,7 @@ TEST_F(StoreTest, OMapTest) {
TEST_F(StoreTest, XattrTest) {
coll_t cid("blah");
- hobject_t hoid("tesomap", "", CEPH_NOSNAP, 0, 0, "");
+ ghobject_t hoid(hobject_t("tesomap", "", CEPH_NOSNAP, 0, 0, ""));
bufferlist big;
for (unsigned i = 0; i < 10000; ++i) {
big.append('\0');
@@ -769,12 +769,12 @@ void colsplittest(
for (uint32_t i = 0; i < 2*num_objects; ++i) {
stringstream objname;
objname << "obj" << i;
- t.touch(cid, hobject_t(
+ t.touch(cid, ghobject_t(hobject_t(
objname.str(),
"",
CEPH_NOSNAP,
i<<common_suffix_size,
- 0, ""));
+ 0, "")));
}
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -788,14 +788,14 @@ void colsplittest(
}
ObjectStore::Transaction t;
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = store->collection_list(cid, objects);
ASSERT_EQ(r, 0);
ASSERT_EQ(objects.size(), num_objects);
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- ASSERT_EQ(!(i->hash & (1<<common_suffix_size)), 0u);
+ ASSERT_EQ(!(i->hobj.hash & (1<<common_suffix_size)), 0u);
t.remove(cid, *i);
}
@@ -803,10 +803,10 @@ void colsplittest(
r = store->collection_list(tid, objects);
ASSERT_EQ(r, 0);
ASSERT_EQ(objects.size(), num_objects);
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- ASSERT_EQ(i->hash & (1<<common_suffix_size), 0u);
+ ASSERT_EQ(i->hobj.hash & (1<<common_suffix_size), 0u);
t.remove(tid, *i);
}
@@ -848,12 +848,12 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Making objects" << std::endl;
for (int i = 0; i < 360; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
+ ghobject_t o;
if (i < 8) {
- o.hash = (i << 16) | 0xA1;
+ o.hobj.hash = (i << 16) | 0xA1;
t.touch(cid, o);
}
- o.hash = (i << 16) | 0xB1;
+ o.hobj.hash = (i << 16) | 0xB1;
t.touch(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -861,8 +861,8 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Removing half" << std::endl;
for (int i = 1; i < 8; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
t.remove(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
@@ -870,24 +870,24 @@ TEST_F(StoreTest, TwoHash) {
std::cout << "Checking" << std::endl;
for (int i = 1; i < 8; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
bool exists = store->exists(cid, o);
ASSERT_EQ(exists, false);
}
{
- hobject_t o;
- o.hash = 0xA1;
+ ghobject_t o;
+ o.hobj.hash = 0xA1;
bool exists = store->exists(cid, o);
ASSERT_EQ(exists, true);
}
std::cout << "Cleanup" << std::endl;
for (int i = 0; i < 360; ++i) {
ObjectStore::Transaction t;
- hobject_t o;
- o.hash = (i << 16) | 0xA1;
+ ghobject_t o;
+ o.hobj.hash = (i << 16) | 0xA1;
t.remove(cid, o);
- o.hash = (i << 16) | 0xB1;
+ o.hobj.hash = (i << 16) | 0xB1;
t.remove(cid, o);
r = store->apply_transaction(t);
ASSERT_EQ(r, 0);
diff --git a/src/test/filestore/workload_generator.cc b/src/test/filestore/workload_generator.cc
index 496379d7ad1..704d93021e2 100644
--- a/src/test/filestore/workload_generator.cc
+++ b/src/test/filestore/workload_generator.cc
@@ -344,12 +344,12 @@ void WorkloadGenerator::do_destroy_collection(ObjectStore::Transaction *t,
{
m_nr_runs.set(0);
entry->m_osr.flush();
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
m_store->collection_list(entry->m_coll, ls);
dout(2) << __func__ << " coll " << entry->m_coll
<< " (" << ls.size() << " objects)" << dendl;
- for (vector<hobject_t>::iterator it = ls.begin(); it < ls.end(); ++it) {
+ for (vector<ghobject_t>::iterator it = ls.begin(); it < ls.end(); ++it) {
t->remove(entry->m_coll, *it);
}
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index 803c8b1cc77..9abac9c412a 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -538,21 +538,25 @@ TEST(LibRadosMisc, BigAttrPP) {
bufferlist got;
- bl.clear();
- got.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size));
- ASSERT_EQ(0, ioctx.setxattr("foo", "one", bl));
- ASSERT_EQ((int)bl.length(), ioctx.getxattr("foo", "one", got));
- ASSERT_TRUE(bl.contents_equal(got));
+ if (g_conf->osd_max_attr_size) {
+ bl.clear();
+ got.clear();
+ bl.append(buffer::create(g_conf->osd_max_attr_size));
+ ASSERT_EQ(0, ioctx.setxattr("foo", "one", bl));
+ ASSERT_EQ((int)bl.length(), ioctx.getxattr("foo", "one", got));
+ ASSERT_TRUE(bl.contents_equal(got));
- bl.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size+1));
- ASSERT_EQ(-EFBIG, ioctx.setxattr("foo", "one", bl));
+ bl.clear();
+ bl.append(buffer::create(g_conf->osd_max_attr_size+1));
+ ASSERT_EQ(-EFBIG, ioctx.setxattr("foo", "one", bl));
+ } else {
+ cout << "osd_max_attr_size == 0; skipping test" << std::endl;
+ }
for (int i=0; i<1000; i++) {
bl.clear();
got.clear();
- bl.append(buffer::create(g_conf->osd_max_attr_size));
+ bl.append(buffer::create(MIN(g_conf->osd_max_attr_size, 1024)));
char n[10];
snprintf(n, sizeof(n), "a%d", i);
ASSERT_EQ(0, ioctx.setxattr("foo", n, bl));
@@ -643,6 +647,60 @@ TEST(LibRadosMisc, CopyPP) {
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
}
+TEST(LibRadosMisc, Dirty) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ ASSERT_EQ(0, cluster.ioctx_create(pool_name.c_str(), ioctx));
+
+ {
+ ObjectWriteOperation op;
+ op.create(true);
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_TRUE(dirty);
+ ASSERT_EQ(0, r);
+ }
+ {
+ ObjectWriteOperation op;
+ op.undirty();
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_FALSE(dirty);
+ ASSERT_EQ(0, r);
+ }
+ {
+ ObjectWriteOperation op;
+ op.truncate(0); // still a write even tho it is a no-op
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+ {
+ bool dirty = false;
+ int r = -1;
+ ObjectReadOperation op;
+ op.is_dirty(&dirty, &r);
+ ASSERT_EQ(0, ioctx.operate("foo", &op, NULL));
+ ASSERT_TRUE(dirty);
+ ASSERT_EQ(0, r);
+ }
+
+ ioctx.close();
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/os/TestFlatIndex.cc b/src/test/os/TestFlatIndex.cc
index 6db4f6c4aa5..53d2bbe6376 100644
--- a/src/test/os/TestFlatIndex.cc
+++ b/src/test/os/TestFlatIndex.cc
@@ -49,8 +49,8 @@ TEST(FlatIndex, collection) {
uint64_t hash = 111;
uint64_t pool = 222;
const std::string object_name(10, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
- vector<hobject_t> ls;
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
+ vector<ghobject_t> ls;
ASSERT_DEATH(index.collection_list_partial(hoid, 0, 0, 0, &ls, &hoid), "0");
}
@@ -70,7 +70,7 @@ TEST(FlatIndex, created_unlink) {
CollectionIndex::IndexedPath indexed_path;
index->set_ref(index);
const std::string object_name(10, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
int exists;
EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
EXPECT_EQ(0, exists);
@@ -88,7 +88,7 @@ TEST(FlatIndex, created_unlink) {
CollectionIndex::IndexedPath indexed_path;
index->set_ref(index);
const std::string object_name(1024, 'A');
- hobject_t hoid(object_t(object_name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(object_name), key, CEPH_NOSNAP, hash, pool, ""));
int exists;
EXPECT_EQ(0, index->lookup(hoid, &indexed_path, &exists));
EXPECT_EQ(0, exists);
@@ -110,10 +110,10 @@ TEST(FlatIndex, collection_list) {
const std::string filename("PATH/" + object_name + "_head");
EXPECT_EQ(0, ::close(::creat(filename.c_str(), 0600)));
std::tr1::shared_ptr<CollectionIndex> index(new FlatIndex(collection, base_path));
- vector<hobject_t> ls;
+ vector<ghobject_t> ls;
index->collection_list(&ls);
EXPECT_EQ((unsigned)1, ls.size());
- EXPECT_EQ(object_name, ls[0].oid.name);
+ EXPECT_EQ(object_name, ls[0].hobj.oid.name);
EXPECT_EQ(0, ::system("rm -fr PATH"));
}
diff --git a/src/test/os/TestLFNIndex.cc b/src/test/os/TestLFNIndex.cc
index 3947329d995..02578eb4a71 100644
--- a/src/test/os/TestLFNIndex.cc
+++ b/src/test/os/TestLFNIndex.cc
@@ -45,10 +45,10 @@ public:
std::tr1::shared_ptr<CollectionIndex> dest
) { return 0; }
- void test_generate_and_parse(const hobject_t &hoid, const std::string &mangled_expected) {
+ void test_generate_and_parse(const ghobject_t &hoid, const std::string &mangled_expected) {
const std::string mangled_name = lfn_generate_object_name(hoid);
EXPECT_EQ(mangled_expected, mangled_name);
- hobject_t hoid_parsed;
+ ghobject_t hoid_parsed;
EXPECT_TRUE(lfn_parse_object_name(mangled_name, &hoid_parsed));
EXPECT_EQ(hoid, hoid_parsed);
}
@@ -58,34 +58,34 @@ protected:
virtual int _created(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
const string &mangled_name
) { return 0; }
virtual int _remove(
const vector<string> &path,
- const hobject_t &hoid,
+ const ghobject_t &hoid,
const string &mangled_name
) { return 0; }
virtual int _lookup(
- const hobject_t &hoid,
+ const ghobject_t &hoid,
vector<string> *path,
string *mangled_name,
int *exists
) { return 0; }
virtual int _collection_list(
- vector<hobject_t> *ls
+ vector<ghobject_t> *ls
) { return 0; }
virtual int _collection_list_partial(
- const hobject_t &start,
+ const ghobject_t &start,
int min_count,
int max_count,
snapid_t seq,
- vector<hobject_t> *ls,
- hobject_t *next
+ vector<ghobject_t> *ls,
+ ghobject_t *next
) { return 0; }
};
@@ -101,9 +101,9 @@ TEST_F(TestHASH_INDEX_TAG, generate_and_parse_name) {
uint64_t hash = 0xABABABAB;
uint64_t pool = -1;
- test_generate_and_parse(hobject_t(object_t(".A/B_\\C.D"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t(".A/B_\\C.D"), key, CEPH_NOSNAP, hash, pool, "")),
"\\.A\\sB_\\\\C.D_head_ABABABAB");
- test_generate_and_parse(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "")),
"\\dA_head_ABABABAB");
}
@@ -123,11 +123,11 @@ TEST_F(TestHASH_INDEX_TAG_2, generate_and_parse_name) {
{
std::string name(".XA/B_\\C.D");
name[1] = '\0';
- hobject_t hoid(object_t(name), key, CEPH_NOSNAP, hash, pool, "");
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""));
test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB");
}
- test_generate_and_parse(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""),
+ test_generate_and_parse(ghobject_t(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "")),
"\\dA_KEY_head_ABABABAB");
}
@@ -143,21 +143,37 @@ TEST_F(TestHOBJECT_WITH_POOL, generate_and_parse_name) {
const std::string key("KEY");
uint64_t hash = 0xABABABAB;
uint64_t pool = 0xCDCDCDCD;
+ int64_t gen = 0xefefefefef;
+ int8_t shard_id = 0xb;
{
std::string name(".XA/B_\\C.D");
name[1] = '\0';
- hobject_t hoid(object_t(name), key, CEPH_NOSNAP, hash, pool, "");
- hoid.nspace = "NSPACE";
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""));
+ hoid.hobj.nspace = "NSPACE";
test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB_NSPACE_cdcdcdcd");
}
{
- hobject_t hoid(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, "");
- hoid.nspace = "NSPACE";
+ ghobject_t hoid(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""));
+ hoid.hobj.nspace = "NSPACE";
test_generate_and_parse(hoid, "\\dA_KEY_head_ABABABAB_NSPACE_cdcdcdcd");
}
+ {
+ std::string name(".XA/B_\\C.D");
+ name[1] = '\0';
+ ghobject_t hoid(hobject_t(object_t(name), key, CEPH_NOSNAP, hash, pool, ""), gen, shard_id);
+ hoid.hobj.nspace = "NSPACE";
+
+ test_generate_and_parse(hoid, "\\.\\nA\\sB\\u\\\\C.D_KEY_head_ABABABAB_NSPACE_cdcdcdcd_efefefefef_b");
+ }
+ {
+ ghobject_t hoid(hobject_t(object_t("DIR_A"), key, CEPH_NOSNAP, hash, pool, ""), gen, shard_id);
+ hoid.hobj.nspace = "NSPACE";
+
+ test_generate_and_parse(hoid, "\\dA_KEY_head_ABABABAB_NSPACE_cdcdcdcd_efefefefef_b");
+ }
}
class TestLFNIndex : public TestWrapLFNIndex, public ::testing::Test {
@@ -167,12 +183,12 @@ public:
virtual void SetUp() {
::chmod("PATH", 0700);
- ::system("rm -fr PATH");
- ::mkdir("PATH", 0700);
+ ASSERT_EQ(0, ::system("rm -fr PATH"));
+ ASSERT_EQ(0, ::mkdir("PATH", 0700));
}
virtual void TearDown() {
- ::system("rm -fr PATH");
+ ASSERT_EQ(0, ::system("rm -fr PATH"));
}
};
@@ -185,7 +201,7 @@ TEST_F(TestLFNIndex, remove_object) {
{
std::string mangled_name;
int exists = 666;
- hobject_t hoid(sobject_t("ABC", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("ABC", CEPH_NOSNAP)));
EXPECT_EQ(0, ::chmod("PATH", 0000));
EXPECT_EQ(-EACCES, remove_object(path, hoid));
@@ -205,7 +221,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
EXPECT_EQ(0, exists);
@@ -226,7 +242,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// PATH/AAA..._0_long => does not match long object name
@@ -275,7 +291,7 @@ TEST_F(TestLFNIndex, remove_object) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// PATH/AAA..._0_long => matches long object name
@@ -323,7 +339,7 @@ TEST_F(TestLFNIndex, get_mangled_name) {
{
std::string mangled_name;
int exists = 666;
- hobject_t hoid(sobject_t("ABC", CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t("ABC", CEPH_NOSNAP)));
EXPECT_EQ(0, get_mangled_name(path, hoid, &mangled_name, &exists));
EXPECT_NE(std::string::npos, mangled_name.find("ABC__head"));
@@ -343,7 +359,7 @@ TEST_F(TestLFNIndex, get_mangled_name) {
std::string mangled_name;
int exists;
const std::string object_name(1024, 'A');
- hobject_t hoid(sobject_t(object_name, CEPH_NOSNAP));
+ ghobject_t hoid(hobject_t(sobject_t(object_name, CEPH_NOSNAP)));
//
// long version of the mangled name and no matching
@@ -441,6 +457,11 @@ int main(int argc, char **argv) {
}
}
-// Local Variables:
-// compile-command: "cd ../.. ; make unittest_lfnindex ; valgrind --tool=memcheck ./unittest_lfnindex # --gtest_filter=TestLFNIndex.* --log-to-stderr=true --debug-filestore=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ;
+ * make unittest_lfnindex &&
+ * valgrind --tool=memcheck ./unittest_lfnindex \
+ * # --gtest_filter=TestLFNIndex.* --log-to-stderr=true --debug-filestore=20"
+ * End:
+ */
diff --git a/src/test/osd/ErasureCodeExample.h b/src/test/osd/ErasureCodeExample.h
index 95d79feb923..0fd55187559 100644
--- a/src/test/osd/ErasureCodeExample.h
+++ b/src/test/osd/ErasureCodeExample.h
@@ -34,17 +34,6 @@
class ErasureCodeExample : public ErasureCodeInterface {
public:
- useconds_t delay;
- ErasureCodeExample(const map<std::string,std::string> &parameters) :
- delay(0)
- {
- if (parameters.find("usleep") != parameters.end()) {
- std::istringstream ss(parameters.find("usleep")->second);
- ss >> delay;
- usleep(delay);
- }
- }
-
virtual ~ErasureCodeExample() {}
virtual int minimum_to_decode(const set<int> &want_to_read,
diff --git a/src/test/osd/ErasureCodePluginExample.cc b/src/test/osd/ErasureCodePluginExample.cc
index 1543b1cdaed..6ae61c0a18d 100644
--- a/src/test/osd/ErasureCodePluginExample.cc
+++ b/src/test/osd/ErasureCodePluginExample.cc
@@ -14,6 +14,8 @@
*
*/
+#include <unistd.h>
+
#include "osd/ErasureCodePlugin.h"
#include "ErasureCodeExample.h"
@@ -22,7 +24,7 @@ public:
virtual int factory(const map<std::string,std::string> &parameters,
ErasureCodeInterfaceRef *erasure_code)
{
- *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample(parameters));
+ *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample());
return 0;
}
};
diff --git a/src/test/osd/ErasureCodePluginHangs.cc b/src/test/osd/ErasureCodePluginHangs.cc
new file mode 100644
index 00000000000..ea73786b526
--- /dev/null
+++ b/src/test/osd/ErasureCodePluginHangs.cc
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <unistd.h>
+#include "osd/ErasureCodePlugin.h"
+
+int __erasure_code_init(char *plugin_name)
+{
+ sleep(1000);
+ return 0;
+}
diff --git a/src/test/osd/Object.cc b/src/test/osd/Object.cc
index 408cc63ac02..d4be4df1bdd 100644
--- a/src/test/osd/Object.cc
+++ b/src/test/osd/Object.cc
@@ -9,10 +9,11 @@
ostream &operator<<(ostream &out, const ContDesc &rhs)
{
- return out << "ObjNum: " << rhs.objnum
- << " snap: " << rhs.cursnap
- << " seqnum: " << rhs.seqnum
- << " prefix: " << rhs.prefix;
+ return out << "(ObjNum " << rhs.objnum
+ << " snap " << rhs.cursnap
+ << " seq_num " << rhs.seqnum
+ //<< " prefix " << rhs.prefix
+ << ")";
}
void VarLenGenerator::get_ranges(const ContDesc &cont, interval_set<uint64_t> &out) {
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index ab0b13d73e4..aba6a531c6f 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -143,6 +143,7 @@ public:
map<int, map<string,ObjectDesc> > pool_obj_cont;
set<string> oid_in_use;
set<string> oid_not_in_use;
+ set<int> snaps_in_use;
int current_snap;
string pool_name;
librados::IoCtx io_ctx;
@@ -403,8 +404,8 @@ public:
void update_object_full(const string &oid, const ObjectDesc &contents)
{
- pool_obj_cont.rbegin()->second.erase(oid);
- pool_obj_cont.rbegin()->second.insert(pair<string,ObjectDesc>(oid, contents));
+ pool_obj_cont[current_snap].erase(oid);
+ pool_obj_cont[current_snap].insert(pair<string,ObjectDesc>(oid, contents));
}
void update_object_version(const string &oid, uint64_t version)
@@ -416,7 +417,7 @@ public:
map<string,ObjectDesc>::iterator j = i->second.find(oid);
if (j != i->second.end()) {
j->second.version = version;
- cout << __func__ << " oid " << oid << " is version " << version << std::endl;
+ cout << __func__ << " oid " << oid << " v " << version << " " << j->second.most_recent() << std::endl;
break;
}
}
@@ -792,25 +793,12 @@ public:
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
interval_set<uint64_t> ranges;
context->cont_gen.get_ranges(cont, ranges);
std::cout << num << ": seq_num " << context->seq_num << " ranges " << ranges << std::endl;
context->seq_num++;
context->state_lock.Unlock();
- int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
- if (r) {
- cerr << " r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
- assert(0);
- }
-
waiting_on = ranges.num_intervals();
//cout << " waiting_on = " << waiting_on << std::endl;
ContentsGenerator::iterator gen_pos = context->cont_gen.get_iterator(cont);
@@ -921,23 +909,10 @@ public:
context->remove_object(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
interval_set<uint64_t> ranges;
context->state_lock.Unlock();
- int r = context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset);
- if (r) {
- cerr << "r is " << r << " snapset is " << snapset << " seq is " << context->seq << std::endl;
- assert(0);
- }
-
- r = context->io_ctx.remove(context->prefix+oid);
+ int r = context->io_ctx.remove(context->prefix+oid);
if (r && !(r == -ENOENT && !present)) {
cerr << "r is " << r << " while deleting " << oid << " and present is " << present << std::endl;
assert(0);
@@ -1069,8 +1044,10 @@ public:
if (!(err == -ENOENT && old_value.deleted())) {
cerr << num << ": Error: oid " << oid << " read returned error code "
<< err << std::endl;
+ context->errors++;
}
} else {
+ cout << num << ": expect " << old_value.most_recent() << std::endl;
assert(!old_value.deleted());
if (old_value.has_contents()) {
ContDesc to_check;
@@ -1271,17 +1248,8 @@ public:
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
-
TestWatchContext *ctx = context->get_watch_context(oid);
context->state_lock.Unlock();
- assert(!context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset));
int r;
if (!ctx) {
{
@@ -1348,18 +1316,12 @@ public:
}
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
+ context->snaps_in_use.insert(roll_back_to);
+
context->roll_back(oid, roll_back_to);
uint64_t snap = context->snaps[roll_back_to];
- vector<uint64_t> snapset(context->snaps.size());
- int j = 0;
- for (map<int,uint64_t>::reverse_iterator i = context->snaps.rbegin();
- i != context->snaps.rend();
- ++i, ++j) {
- snapset[j] = i->second;
- }
context->state_lock.Unlock();
- assert(!context->io_ctx.selfmanaged_snap_set_write_ctx(context->seq, snapset));
op.selfmanaged_snap_rollback(snap);
@@ -1383,6 +1345,7 @@ public:
context->update_object_version(oid, comp->get_version64());
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
+ context->snaps_in_use.erase(roll_back_to);
context->kick();
}
@@ -1402,6 +1365,7 @@ public:
string oid, oid_src;
ObjectDesc src_value;
librados::ObjectWriteOperation op;
+ librados::ObjectReadOperation rd_op;
librados::AioCompletion *comp;
librados::AioCompletion *comp_racing_read;
int snap;
@@ -1439,6 +1403,8 @@ public:
snap = -1;
}
context->find_object(oid_src, &src_value, snap);
+ if (!src_value.deleted())
+ context->update_object_full(oid, src_value);
string src = context->prefix+oid_src;
op.copy_from(src.c_str(), context->io_ctx, src_value.version);
@@ -1456,7 +1422,12 @@ public:
new TestOp::CallbackInfo(1));
comp_racing_read = context->rados.aio_create_completion((void*) read_cb_arg, &write_callback,
NULL);
- context->io_ctx.aio_stat(context->prefix+oid, comp_racing_read, NULL, NULL);
+ rd_op.stat(NULL, NULL, NULL);
+ context->io_ctx.aio_operate(context->prefix+oid, comp_racing_read, &rd_op,
+ librados::SNAP_HEAD,
+ librados::OPERATION_ORDER_READS_WRITES, // order wrt previous write/update
+ NULL);
+
}
void _finish(CallbackInfo *info)
@@ -1472,14 +1443,16 @@ public:
assert(comp->is_complete());
cout << num << ": finishing copy_from to " << context->prefix + oid << std::endl;
if ((r = comp->get_return_value())) {
- if (!(r == -ENOENT && src_value.deleted())) {
+ if (r == -ENOENT && src_value.deleted()) {
+ cout << num << ": got expected ENOENT (src dne)" << std::endl;
+ } else {
cerr << "Error: oid " << oid << " copy_from " << oid_src << " returned error code "
<< r << std::endl;
+ assert(0);
}
} else {
assert(!version || comp->get_version64() == version);
version = comp->get_version64();
- context->update_object_full(oid, src_value);
context->update_object_version(oid, comp->get_version64());
}
} else if (info->id == 1) {
diff --git a/src/test/osd/TestErasureCodeExample.cc b/src/test/osd/TestErasureCodeExample.cc
index 6866dfdbb9f..f12e80c8cd0 100644
--- a/src/test/osd/TestErasureCodeExample.cc
+++ b/src/test/osd/TestErasureCodeExample.cc
@@ -20,24 +20,9 @@
#include "global/global_context.h"
#include "gtest/gtest.h"
-TEST(ErasureCodeExample, constructor)
-{
- map<std::string,std::string> parameters;
- {
- ErasureCodeExample example(parameters);
- EXPECT_EQ(0u, example.delay);
- }
- parameters["usleep"] = "10";
- {
- ErasureCodeExample example(parameters);
- EXPECT_EQ(10u, example.delay);
- }
-}
-
TEST(ErasureCodeExample, minimum_to_decode)
{
- map<std::string,std::string> parameters;
- ErasureCodeExample example(parameters);
+ ErasureCodeExample example;
set<int> available_chunks;
set<int> want_to_read;
want_to_read.insert(1);
@@ -72,8 +57,7 @@ TEST(ErasureCodeExample, minimum_to_decode)
TEST(ErasureCodeExample, minimum_to_decode_with_cost)
{
- map<std::string,std::string> parameters;
- ErasureCodeExample example(parameters);
+ ErasureCodeExample example;
map<int,int> available;
set<int> want_to_read;
want_to_read.insert(1);
@@ -117,8 +101,7 @@ TEST(ErasureCodeExample, minimum_to_decode_with_cost)
TEST(ErasureCodeExample, encode_decode)
{
- map<std::string,std::string> parameters;
- ErasureCodeExample example(parameters);
+ ErasureCodeExample example;
bufferlist in;
in.append("ABCDE");
diff --git a/src/test/osd/TestErasureCodeJerasure.cc b/src/test/osd/TestErasureCodeJerasure.cc
index 22aaff7e5fa..a51cb853c86 100644
--- a/src/test/osd/TestErasureCodeJerasure.cc
+++ b/src/test/osd/TestErasureCodeJerasure.cc
@@ -47,9 +47,19 @@ TYPED_TEST(ErasureCodeTest, encode_decode)
parameters["erasure-code-packetsize"] = "8";
jerasure.init(parameters);
+#define LARGE_ENOUGH 2048
+ bufferptr in_ptr(LARGE_ENOUGH);
+ in_ptr.zero();
+ in_ptr.set_length(0);
+ const char *payload =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+ in_ptr.append(payload, strlen(payload));
bufferlist in;
- for (int i = 0; i < 5; i++)
- in.append("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789");
+ in.push_front(in_ptr);
int want_to_encode[] = { 0, 1, 2, 3 };
map<int, bufferlist> encoded;
EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
@@ -58,7 +68,8 @@ TYPED_TEST(ErasureCodeTest, encode_decode)
EXPECT_EQ(4u, encoded.size());
unsigned length = encoded[0].length();
EXPECT_EQ(0, strncmp(encoded[0].c_str(), in.c_str(), length));
- EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length, in.length() - length));
+ EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length,
+ in.length() - length));
// all chunks are available
@@ -72,7 +83,8 @@ TYPED_TEST(ErasureCodeTest, encode_decode)
EXPECT_EQ(4u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
- EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length, in.length() - length));
+ EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+ in.length() - length));
}
// two chunks are missing
@@ -90,7 +102,8 @@ TYPED_TEST(ErasureCodeTest, encode_decode)
EXPECT_EQ(4u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
- EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length, in.length() - length));
+ EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
+ in.length() - length));
}
}
@@ -192,6 +205,84 @@ TYPED_TEST(ErasureCodeTest, minimum_to_decode)
}
}
+TEST(ErasureCodeTest, encode)
+{
+ ErasureCodeJerasureReedSolomonVandermonde jerasure;
+ map<std::string,std::string> parameters;
+ parameters["erasure-code-k"] = "2";
+ parameters["erasure-code-m"] = "2";
+ parameters["erasure-code-w"] = "8";
+ jerasure.init(parameters);
+
+ unsigned alignment = jerasure.get_alignment();
+ {
+ //
+ // When the input bufferlist is perfectly aligned, it is
+ // pointed to unmodified by the returned encoded chunks.
+ //
+ bufferlist in;
+ map<int,bufferlist> encoded;
+ int want_to_encode[] = { 0, 1, 2, 3 };
+ in.append(string(alignment * 2, 'X'));
+ EXPECT_EQ(alignment * 2, in.length());
+ EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
+ in,
+ &encoded));
+ EXPECT_EQ(4u, encoded.size());
+ for(int i = 0; i < 4; i++)
+ EXPECT_EQ(alignment, encoded[i].length());
+ EXPECT_EQ(in.c_str(), encoded[0].c_str());
+ EXPECT_EQ(in.c_str() + alignment, encoded[1].c_str());
+ }
+
+ {
+ //
+ // When the input bufferlist needs to be padded because
+ // it is not properly aligned, it is padded with zeros.
+ // The beginning of the input bufferlist is pointed to
+ // unmodified by the returned encoded chunk, only the
+ // trailing chunk is allocated and copied.
+ //
+ bufferlist in;
+ map<int,bufferlist> encoded;
+ int want_to_encode[] = { 0, 1, 2, 3 };
+ int trail_length = 10;
+ in.append(string(alignment + trail_length, 'X'));
+ EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
+ in,
+ &encoded));
+ EXPECT_EQ(4u, encoded.size());
+ for(int i = 0; i < 4; i++)
+ EXPECT_EQ(alignment, encoded[i].length());
+ EXPECT_EQ(in.c_str(), encoded[0].c_str());
+ EXPECT_NE(in.c_str() + alignment, encoded[1].c_str());
+ char *last_chunk = encoded[1].c_str();
+ EXPECT_EQ('X', last_chunk[0]);
+ EXPECT_EQ('\0', last_chunk[trail_length]);
+ }
+
+ {
+ //
+ // When only the first chunk is required, the encoded map only
+ // contains the first chunk. Although the jerasure encode
+ // internally allocated a buffer because of padding requirements
+ // and also computes the coding chunks, they are released before
+ // the return of the method, as shown when running the tests thru
+ // valgrind that shows there is no leak.
+ //
+ bufferlist in;
+ map<int,bufferlist> encoded;
+ set<int> want_to_encode;
+ want_to_encode.insert(0);
+ int trail_length = 10;
+ in.append(string(alignment + trail_length, 'X'));
+ EXPECT_EQ(0, jerasure.encode(want_to_encode, in, &encoded));
+ EXPECT_EQ(1u, encoded.size());
+ EXPECT_EQ(alignment, encoded[0].length());
+ EXPECT_EQ(in.c_str(), encoded[0].c_str());
+ }
+}
+
int main(int argc, char **argv)
{
vector<const char*> args;
@@ -204,6 +295,12 @@ int main(int argc, char **argv)
return RUN_ALL_TESTS();
}
-// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_jerasure && valgrind --tool=memcheck ./unittest_erasure_code_jerasure --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ * make unittest_erasure_code_jerasure &&
+ * valgrind --tool=memcheck --leak-check=full \
+ * ./unittest_erasure_code_jerasure \
+ * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
diff --git a/src/test/osd/TestErasureCodePlugin.cc b/src/test/osd/TestErasureCodePlugin.cc
index ba7d13fbd2d..46ed4b1730d 100644
--- a/src/test/osd/TestErasureCodePlugin.cc
+++ b/src/test/osd/TestErasureCodePlugin.cc
@@ -28,19 +28,12 @@ protected:
class Thread_factory : public Thread {
public:
- useconds_t delay;
-
- Thread_factory(useconds_t _delay) :
- delay(_delay)
- {}
-
virtual void *entry() {
map<std::string,std::string> parameters;
parameters["erasure-code-directory"] = ".libs";
- parameters["usleep"] = delay;
ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
ErasureCodeInterfaceRef erasure_code;
- instance.factory("example", parameters, &erasure_code);
+ instance.factory("hangs", parameters, &erasure_code);
return NULL;
}
};
@@ -58,7 +51,7 @@ TEST_F(ErasureCodePluginRegistryTest, factory_mutex) {
//
useconds_t delay = 0;
const useconds_t DELAY_MAX = 20 * 1000 * 1000;
- Thread_factory sleep_forever(1024 * 1024 * 1024);
+ Thread_factory sleep_forever;
sleep_forever.create();
do {
cout << "Trying (1) with delay " << delay << "us\n";
@@ -71,7 +64,8 @@ TEST_F(ErasureCodePluginRegistryTest, factory_mutex) {
EXPECT_FALSE(instance.lock.TryLock());
- EXPECT_EQ(0, sleep_forever.detach());
+ EXPECT_EQ(0, pthread_cancel(sleep_forever.get_thread_id()));
+ EXPECT_EQ(0, sleep_forever.join());
}
TEST_F(ErasureCodePluginRegistryTest, all)
diff --git a/src/test/osd/TestErasureCodePluginJerasure.cc b/src/test/osd/TestErasureCodePluginJerasure.cc
index fe819c71a39..2f558937595 100644
--- a/src/test/osd/TestErasureCodePluginJerasure.cc
+++ b/src/test/osd/TestErasureCodePluginJerasure.cc
@@ -51,7 +51,8 @@ TEST(ErasureCodePlugin, factory)
}
}
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
@@ -62,6 +63,12 @@ int main(int argc, char **argv) {
return RUN_ALL_TESTS();
}
-// Local Variables:
-// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_jerasure_plugin && valgrind --tool=memcheck ./unittest_erasure_code_jerasure_plugin --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
-// End:
+/*
+ * Local Variables:
+ * compile-command: "cd ../.. ; make -j4 &&
+ * make unittest_erasure_code_plugin_jerasure &&
+ * valgrind --tool=memcheck ./unittest_erasure_code_plugin_jerasure \
+ * --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
+ * End:
+ */
+
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index be919161579..7158f50a74a 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -120,13 +120,16 @@ private:
}
case TEST_OP_ROLLBACK:
- if (context.snaps.empty()) {
+ if (context.snaps.size() <= context.snaps_in_use.size()) {
return NULL;
- } else {
+ }
+ while (true) {
int snap = rand_choose(context.snaps)->first;
+ if (context.snaps_in_use.count(snap))
+ continue; // in use; try again!
string oid = *(rand_choose(context.oid_not_in_use));
cout << "rollback oid " << oid << " to " << snap << std::endl;
- return new RollbackOp(m_op, &context, oid, snap);
+ return new RollbackOp(m_op, &context, oid, snap);
}
case TEST_OP_SETATTR:
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
new file mode 100755
index 00000000000..34bcf698e5a
--- /dev/null
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -0,0 +1,1061 @@
+#!/usr/bin/nosetests --nocapture
+# -*- mode:python; tab-width:4; indent-tabs-mode:t -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+
+from nose.tools import eq_ as eq
+from nose.tools import *
+
+from ceph_argparse import validate_command, parse_json_funcsigs
+
+import os
+import re
+import json
+
+def get_command_descriptions(what):
+ buffer = os.popen("./get_command_descriptions " + "--" + what
+ + " 2>&1 | grep cmd000").read()
+ return re.sub(r'^.*?(\{.*\})', '\g<1>', buffer)
+
+def test_parse_json_funcsigs():
+ commands = get_command_descriptions("all")
+ cmd_json = parse_json_funcsigs(commands, 'cli')
+
+ # syntax error https://github.com/ceph/ceph/pull/585
+ commands = get_command_descriptions("pull585")
+ assert_raises(TypeError, parse_json_funcsigs, commands, 'cli')
+
+sigdict = parse_json_funcsigs(get_command_descriptions("all"), 'cli')
+
+
+class TestArgparse:
+
+ def assert_valid_command(self, args):
+ result = validate_command(sigdict, args)
+ assert_not_in(result, [None, {}])
+
+ def check_1_natural_arg(self, prefix, command):
+ self.assert_valid_command([prefix, command, '1'])
+ assert_equal({}, validate_command(sigdict, [prefix, command]))
+ assert_equal({}, validate_command(sigdict, [prefix, command, '-1']))
+ assert_equal({}, validate_command(sigdict, [prefix, command, '1',
+ '1']))
+
+ def check_0_or_1_natural_arg(self, prefix, command):
+ self.assert_valid_command([prefix, command, '1'])
+ self.assert_valid_command([prefix, command])
+ assert_equal({}, validate_command(sigdict, [prefix, command, '-1']))
+ assert_equal({}, validate_command(sigdict, [prefix, command, '1',
+ '1']))
+
+ def check_1_string_arg(self, prefix, command):
+ assert_equal({}, validate_command(sigdict, [prefix, command]))
+ self.assert_valid_command([prefix, command, 'string'])
+ assert_equal({}, validate_command(sigdict, [prefix,
+ command,
+ 'string',
+ 'toomany']))
+
+ def check_1_or_more_string_args(self, prefix, command):
+ assert_equal({}, validate_command(sigdict, [prefix,
+ command]))
+ self.assert_valid_command([prefix,
+ command,
+ 'string'])
+ self.assert_valid_command([prefix,
+ command,
+ 'string',
+ 'more string'])
+
+ def check_no_arg(self, prefix, command):
+ self.assert_valid_command([prefix,
+ command])
+ assert_equal({}, validate_command(sigdict, [prefix,
+ command,
+ 'toomany']))
+
+
+class TestPG(TestArgparse):
+
+ def test_stat(self):
+ self.assert_valid_command(['pg', 'stat'])
+
+ def test_getmap(self):
+ self.assert_valid_command(['pg', 'getmap'])
+
+ def test_send_pg_creates(self):
+ self.assert_valid_command(['pg', 'send_pg_creates'])
+
+ def test_dump(self):
+ self.assert_valid_command(['pg', 'dump'])
+ self.assert_valid_command(['pg', 'dump',
+ 'all',
+ 'summary',
+ 'sum',
+ 'delta',
+ 'pools',
+ 'osds',
+ 'pgs',
+ 'pgs_brief'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'dump', 'invalid']))
+
+ def test_dump_json(self):
+ self.assert_valid_command(['pg', 'dump_json'])
+ self.assert_valid_command(['pg', 'dump_json',
+ 'all',
+ 'summary',
+ 'sum',
+ 'pools',
+ 'osds',
+ 'pgs'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'dump_json',
+ 'invalid']))
+
+ def test_dump_pools_json(self):
+ self.assert_valid_command(['pg', 'dump_pools_json'])
+
+ def test_dump_pools_stuck(self):
+ self.assert_valid_command(['pg', 'dump_stuck'])
+ self.assert_valid_command(['pg', 'dump_stuck',
+ 'inactive',
+ 'unclean',
+ 'stale'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'dump_stuck',
+ 'invalid']))
+ self.assert_valid_command(['pg', 'dump_stuck',
+ 'inactive',
+ '1234'])
+
+ def one_pgid(self, command):
+ self.assert_valid_command(['pg', command, '1.1'])
+ assert_equal({}, validate_command(sigdict, ['pg', command]))
+ assert_equal({}, validate_command(sigdict, ['pg', command, '1']))
+
+ def test_map(self):
+ self.one_pgid('map')
+
+ def test_scrub(self):
+ self.one_pgid('scrub')
+
+ def test_deep_scrub(self):
+ self.one_pgid('deep-scrub')
+
+ def test_repair(self):
+ self.one_pgid('repair')
+
+ def test_debug(self):
+ self.assert_valid_command(['pg',
+ 'debug',
+ 'unfound_objects_exist'])
+ self.assert_valid_command(['pg',
+ 'debug',
+ 'degraded_pgs_exist'])
+ assert_equal({}, validate_command(sigdict, ['pg', 'debug']))
+ assert_equal({}, validate_command(sigdict, ['pg', 'debug',
+ 'invalid']))
+
+ def test_force_create_pg(self):
+ self.one_pgid('force_create_pg')
+
+ def set_ratio(self, command):
+ self.assert_valid_command(['pg',
+ command,
+ '0.0'])
+ assert_equal({}, validate_command(sigdict, ['pg', command]))
+ assert_equal({}, validate_command(sigdict, ['pg',
+ command,
+ '2.0']))
+
+ def test_set_full_ratio(self):
+ self.set_ratio('set_full_ratio')
+
+ def test_set_nearfull_ratio(self):
+ self.set_ratio('set_nearfull_ratio')
+
+
+class TestAuth(TestArgparse):
+
+ def test_export(self):
+ self.assert_valid_command(['auth', 'export'])
+ self.assert_valid_command(['auth',
+ 'export',
+ 'string'])
+ assert_equal({}, validate_command(sigdict, ['auth',
+ 'export',
+ 'string',
+ 'toomany']))
+
+ def test_get(self):
+ self.check_1_string_arg('auth', 'get')
+
+ def test_get_key(self):
+ self.check_1_string_arg('auth', 'get-key')
+
+ def test_print_key(self):
+ self.check_1_string_arg('auth', 'print-key')
+ self.check_1_string_arg('auth', 'print_key')
+
+ def test_list(self):
+ self.check_no_arg('auth', 'list')
+
+ def test_import(self):
+ self.check_no_arg('auth', 'import')
+
+ def test_add(self):
+ self.check_1_or_more_string_args('auth', 'add')
+
+ def test_get_or_create_key(self):
+ self.check_1_or_more_string_args('auth', 'get-or-create-key')
+
+ def test_get_or_create(self):
+ self.check_1_or_more_string_args('auth', 'get-or-create')
+
+ def test_caps(self):
+ assert_equal({}, validate_command(sigdict, ['auth',
+ 'caps']))
+ assert_equal({}, validate_command(sigdict, ['auth',
+ 'caps',
+ 'string']))
+ self.assert_valid_command(['auth',
+ 'caps',
+ 'string',
+ 'more string'])
+
+ def test_del(self):
+ self.check_1_string_arg('auth', 'del')
+
+
+class TestMonitor(TestArgparse):
+
+ def test_compact(self):
+ self.assert_valid_command(['compact'])
+
+ def test_scrub(self):
+ self.assert_valid_command(['scrub'])
+
+ def test_fsid(self):
+ self.assert_valid_command(['fsid'])
+
+ def test_log(self):
+ assert_equal({}, validate_command(sigdict, ['log']))
+ self.assert_valid_command(['log', 'a logtext'])
+ self.assert_valid_command(['log', 'a logtext', 'and another'])
+
+ def test_injectargs(self):
+ assert_equal({}, validate_command(sigdict, ['injectargs']))
+ self.assert_valid_command(['injectargs', 'one'])
+ self.assert_valid_command(['injectargs', 'one', 'two'])
+
+ def test_status(self):
+ self.assert_valid_command(['status'])
+
+ def test_health(self):
+ self.assert_valid_command(['health'])
+ self.assert_valid_command(['health', 'detail'])
+ assert_equal({}, validate_command(sigdict, ['health', 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['health', 'detail',
+ 'toomany']))
+
+ def test_df(self):
+ self.assert_valid_command(['df'])
+ self.assert_valid_command(['df', 'detail'])
+ assert_equal({}, validate_command(sigdict, ['df', 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['df', 'detail',
+ 'toomany']))
+
+ def test_report(self):
+ self.assert_valid_command(['report'])
+ self.assert_valid_command(['report', 'tag1'])
+ self.assert_valid_command(['report', 'tag1', 'tag2'])
+
+ def test_quorum_status(self):
+ self.assert_valid_command(['quorum_status'])
+
+ def test_mon_status(self):
+ self.assert_valid_command(['mon_status'])
+
+ def test_sync_force(self):
+ self.assert_valid_command(['sync',
+ 'force',
+ '--yes-i-really-mean-it',
+ '--i-know-what-i-am-doing'])
+ self.assert_valid_command(['sync',
+ 'force',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['sync',
+ 'force'])
+ assert_equal({}, validate_command(sigdict, ['sync']))
+ assert_equal({}, validate_command(sigdict, ['sync',
+ 'force',
+ '--yes-i-really-mean-it',
+ '--i-know-what-i-am-doing',
+ 'toomany']))
+
+ def test_heap(self):
+ assert_equal({}, validate_command(sigdict, ['heap']))
+ assert_equal({}, validate_command(sigdict, ['heap', 'invalid']))
+ self.assert_valid_command(['heap', 'dump'])
+ self.assert_valid_command(['heap', 'start_profiler'])
+ self.assert_valid_command(['heap', 'stop_profiler'])
+ self.assert_valid_command(['heap', 'release'])
+ self.assert_valid_command(['heap', 'stats'])
+
+ def test_quorum(self):
+ assert_equal({}, validate_command(sigdict, ['quorum']))
+ assert_equal({}, validate_command(sigdict, ['quorum', 'invalid']))
+ self.assert_valid_command(['quorum', 'enter'])
+ self.assert_valid_command(['quorum', 'exit'])
+ assert_equal({}, validate_command(sigdict, ['quorum',
+ 'enter',
+ 'toomany']))
+
+ def test_tell(self):
+ assert_equal({}, validate_command(sigdict, ['tell']))
+ assert_equal({}, validate_command(sigdict, ['tell', 'invalid']))
+ for name in ('osd', 'mon', 'client', 'mds'):
+ assert_equal({}, validate_command(sigdict, ['tell', name]))
+ assert_equal({}, validate_command(sigdict, ['tell',
+ name + ".42"]))
+ self.assert_valid_command(['tell', name + ".42", 'something'])
+ self.assert_valid_command(['tell', name + ".42",
+ 'something',
+ 'something else'])
+
+
+class TestMDS(TestArgparse):
+
+ def test_stat(self):
+ self.check_no_arg('mds', 'stat')
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('mds', 'dump')
+
+ def test_tell(self):
+ self.assert_valid_command(['mds', 'tell',
+ 'someone',
+ 'something'])
+ self.assert_valid_command(['mds', 'tell',
+ 'someone',
+ 'something',
+ 'something else'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'tell']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'tell',
+ 'someone']))
+
+ def test_compat_show(self):
+ self.assert_valid_command(['mds', 'compat', 'show'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'compat']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'compat',
+ 'show', 'toomany']))
+
+ def test_stop(self):
+ self.assert_valid_command(['mds', 'stop', 'someone'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'stop']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'stop',
+ 'someone', 'toomany']))
+
+ def test_deactivate(self):
+ self.assert_valid_command(['mds', 'deactivate', 'someone'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'deactivate']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'deactivate',
+ 'someone', 'toomany']))
+
+ def test_set_max_mds(self):
+ self.check_1_natural_arg('mds', 'set_max_mds')
+
+ def test_setmap(self):
+ self.check_1_natural_arg('mds', 'setmap')
+
+ def test_set_state(self):
+ self.assert_valid_command(['mds', 'set_state', '1', '2'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state',
+ '1', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'set_state',
+ '1', '21']))
+
+ def test_fail(self):
+ self.check_1_string_arg('mds', 'fail')
+
+ def test_rm(self):
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm', '1']))
+ for name in ('osd', 'mon', 'client', 'mds'):
+ self.assert_valid_command(['mds', 'rm', '1', name + '.42'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '-1', name + '.42']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '-1', name]))
+ assert_equal({}, validate_command(sigdict, ['mds', 'rm',
+ '1', name + '.42',
+ 'toomany']))
+
+ def test_rmfailed(self):
+ self.check_1_natural_arg('mds', 'rmfailed')
+
+ def test_cluster_down(self):
+ self.check_no_arg('mds', 'cluster_down')
+
+ def test_cluster_up(self):
+ self.check_no_arg('mds', 'cluster_up')
+
+ def test_compat_rm_compat(self):
+ self.assert_valid_command(['mds', 'compat', 'rm_compat', '1'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_compat', '1', '1']))
+
+ def test_incompat_rm_incompat(self):
+ self.assert_valid_command(['mds', 'compat', 'rm_incompat', '1'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat', '-1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'compat',
+ 'rm_incompat', '1', '1']))
+
+ def test_mds_set(self):
+ self.assert_valid_command(['mds', 'set', 'allow_new_snaps'])
+ self.assert_valid_command(['mds', 'set', 'allow_new_snaps', 'sure'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'set',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'set',
+ 'allow_new_snaps',
+ 'sure',
+ 'toomany']))
+
+ def test_mds_unset(self):
+ self.assert_valid_command(['mds', 'unset', 'allow_new_snaps'])
+ self.assert_valid_command(['mds', 'unset', 'allow_new_snaps', 'sure'])
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'unset',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'unset',
+ 'allow_new_snaps',
+ 'sure',
+ 'toomany']))
+
+ def test_add_data_pool(self):
+ self.check_1_natural_arg('mds', 'add_data_pool')
+
+ def test_remove_data_pool(self):
+ self.check_1_natural_arg('mds', 'remove_data_pool')
+
+ def test_newfs(self):
+ self.assert_valid_command(['mds', 'newfs', '1', '2',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['mds', 'newfs', '1', '2'])
+ assert_equal({}, validate_command(sigdict, ['mds', 'newfs']))
+ assert_equal({}, validate_command(sigdict, ['mds', 'newfs', '1']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '1',
+ '2',
+ '--yes-i-really-mean-it',
+ 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '-1',
+ '2',
+ '--yes-i-really-mean-it']))
+ assert_equal({}, validate_command(sigdict, ['mds',
+ 'newfs',
+ '1',
+ '-1',
+ '--yes-i-really-mean-it']))
+
+
+class TestMon(TestArgparse):
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('mon', 'dump')
+
+ def test_stat(self):
+ self.check_no_arg('mon', 'stat')
+
+ def test_getmap(self):
+ self.check_0_or_1_natural_arg('mon', 'getmap')
+
+ def test_add(self):
+ self.assert_valid_command(['mon', 'add', 'name', '1.2.3.4:1234'])
+ assert_equal({}, validate_command(sigdict, ['mon', 'add']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add', 'name']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add',
+ 'name',
+ '400.500.600.700']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'add', 'name',
+ '1.2.3.4:1234',
+ 'toomany']))
+
+ def test_remove(self):
+ self.assert_valid_command(['mon', 'remove', 'name'])
+ assert_equal({}, validate_command(sigdict, ['mon', 'remove']))
+ assert_equal({}, validate_command(sigdict, ['mon', 'remove',
+ 'name', 'toomany']))
+
+
+class TestOSD(TestArgparse):
+
+ def test_stat(self):
+ self.check_no_arg('osd', 'stat')
+
+ def test_dump(self):
+ self.check_0_or_1_natural_arg('osd', 'dump')
+
+ def test_osd_tree(self):
+ self.check_0_or_1_natural_arg('osd', 'tree')
+
+ def test_osd_ls(self):
+ self.check_0_or_1_natural_arg('osd', 'ls')
+
+ def test_osd_getmap(self):
+ self.check_0_or_1_natural_arg('osd', 'getmap')
+
+ def test_osd_getcrushmap(self):
+ self.check_0_or_1_natural_arg('osd', 'getcrushmap')
+
+ def test_perf(self):
+ self.check_no_arg('osd', 'perf')
+
+ def test_getmaxosd(self):
+ self.check_no_arg('osd', 'getmaxosd')
+
+ def test_find(self):
+ self.check_1_natural_arg('osd', 'find')
+
+ def test_map(self):
+ self.assert_valid_command(['osd', 'map', 'poolname', 'objectname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'map']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'map', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'map',
+ 'poolname', 'objectname',
+ 'toomany']))
+
+ def test_scrub(self):
+ self.check_1_string_arg('osd', 'scrub')
+
+ def test_deep_scrub(self):
+ self.check_1_string_arg('osd', 'deep-scrub')
+
+ def test_repair(self):
+ self.check_1_string_arg('osd', 'repair')
+
+ def test_lspools(self):
+ self.assert_valid_command(['osd', 'lspools'])
+ self.assert_valid_command(['osd', 'lspools', '1'])
+ self.assert_valid_command(['osd', 'lspools', '-1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'lspools',
+ '1', 'toomany']))
+
+ def test_blacklist_ls(self):
+ self.assert_valid_command(['osd', 'blacklist', 'ls'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ 'ls', 'toomany']))
+
+ def test_crush_rule(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush', 'rule']))
+ for subcommand in ('list', 'ls', 'dump'):
+ self.assert_valid_command(['osd', 'crush', 'rule', subcommand])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', subcommand,
+ 'toomany']))
+
+ def test_crush_dump(self):
+ self.assert_valid_command(['osd', 'crush', 'dump'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'dump', 'toomany']))
+
+ def test_setcrushmap(self):
+ self.check_no_arg('osd', 'setcrushmap')
+
+ def test_crush_add_bucket(self):
+ self.assert_valid_command(['osd', 'crush', 'add-bucket',
+ 'name', 'type'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket', 'name',
+ 'type',
+ 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'add-bucket', '!!!',
+ 'type']))
+
+ def check_crush_setter(self, setter):
+ self.assert_valid_command(['osd', 'crush', setter,
+ '*', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ 'osd.0', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ '0', '2.3', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', setter,
+ '0', '2.3', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0']))
+ assert_in(validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0',
+ '-1.0']),
+ [None, {}])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ setter,
+ 'osd.0',
+ '1.0',
+ '!!!']))
+
+ def test_crush_set(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('set')
+
+ def test_crush_add(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('add')
+
+ def test_crush_create_or_move(self):
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush']))
+ self.check_crush_setter('create-or-move')
+
+ def test_crush_move(self):
+ self.assert_valid_command(['osd', 'crush', 'move',
+ 'AZaz09-_.', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', 'move',
+ '0', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', 'AZaz09-_.']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', '!!!',
+ 'AZaz09-_.=']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'move', 'AZaz09-_.',
+ '!!!']))
+
+ def test_crush_link(self):
+ self.assert_valid_command(['osd', 'crush', 'link',
+ 'name', 'AZaz09-_.='])
+ self.assert_valid_command(['osd', 'crush', 'link',
+ 'name', 'AZaz09-_.=', 'AZaz09-_.='])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'link']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'link',
+ 'name']))
+
+ def test_crush_rm(self):
+ for alias in ('rm', 'remove', 'unlink'):
+ self.assert_valid_command(['osd', 'crush', alias, 'AZaz09-_.'])
+ self.assert_valid_command(['osd', 'crush', alias,
+ 'AZaz09-_.', 'AZaz09-_.'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ alias]))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ alias,
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_crush_reweight(self):
+ self.assert_valid_command(['osd', 'crush', 'reweight',
+ 'AZaz09-_.', '2.3'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ 'AZaz09-_.']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ 'AZaz09-_.',
+ '-1.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'reweight',
+ '!!!',
+ '2.3']))
+
+ def test_crush_tunables(self):
+ for tunable in ('legacy', 'argonaut', 'bobtail', 'optimal', 'default'):
+ self.assert_valid_command(['osd', 'crush', 'tunables',
+ tunable])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'tunables']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush', 'tunables',
+ 'default', 'toomany']))
+
+ def test_crush_rule_create_simple(self):
+ self.assert_valid_command(['osd', 'crush', 'rule', 'create-simple',
+ 'AZaz09-_.', 'AZaz09-_.', 'AZaz09-_.'])
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ '!!!',
+ 'AZaz09-_.',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ '|||',
+ 'AZaz09-_.']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ '+++']))
+ assert_equal(None, validate_command(sigdict, ['osd', 'crush',
+ 'create-simple',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_crush_rule_rm(self):
+ self.assert_valid_command(['osd', 'crush', 'rule', 'rm', 'AZaz09-_.'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm',
+ '!!!!']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'crush',
+ 'rule', 'rm',
+ 'AZaz09-_.',
+ 'toomany']))
+
+ def test_setmaxosd(self):
+ self.check_1_natural_arg('osd', 'setmaxosd')
+
+ def test_pause(self):
+ self.check_no_arg('osd', 'pause')
+
+ def test_unpause(self):
+ self.check_no_arg('osd', 'unpause')
+
+ def test_set_unset(self):
+ for action in ('set', 'unset'):
+ for flag in ('pause', 'noup', 'nodown', 'noout', 'noin',
+ 'nobackfill', 'norecover', 'noscrub', 'nodeep-scrub'):
+ self.assert_valid_command(['osd', action, flag])
+ assert_equal({}, validate_command(sigdict, ['osd', action]))
+ assert_equal({}, validate_command(sigdict, ['osd', action,
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', action,
+ 'pause', 'toomany']))
+
+ def test_cluster_snap(self):
+ assert_equal(None, validate_command(sigdict, ['osd', 'cluster_snap']))
+
+ def test_down(self):
+ self.check_1_or_more_string_args('osd', 'down')
+
+ def test_out(self):
+ self.check_1_or_more_string_args('osd', 'out')
+
+ def test_in(self):
+ self.check_1_or_more_string_args('osd', 'in')
+
+ def test_rm(self):
+ self.check_1_or_more_string_args('osd', 'rm')
+
+ def test_reweight(self):
+ self.assert_valid_command(['osd', 'reweight', '1', '0.1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1', '2.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '-1', '0.1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'reweight',
+ '1', '0.1',
+ 'toomany']))
+
+ def test_lost(self):
+ self.assert_valid_command(['osd', 'lost', '1',
+ '--yes-i-really-mean-it'])
+ self.assert_valid_command(['osd', 'lost', '1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '1',
+ 'what?']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '-1',
+ '--yes-i-really-mean-it']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'lost',
+ '1',
+ '--yes-i-really-mean-it',
+ 'toomany']))
+
+ def test_create(self):
+ uuid = '12345678123456781234567812345678'
+ self.assert_valid_command(['osd', 'create'])
+ self.assert_valid_command(['osd', 'create',
+ uuid])
+ assert_equal({}, validate_command(sigdict, ['osd', 'create',
+ 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'create',
+ uuid,
+ 'toomany']))
+
+ def test_blacklist(self):
+ for action in ('add', 'rm'):
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4/567'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4/567', '600.40'])
+ self.assert_valid_command(['osd', 'blacklist', action,
+ '1.2.3.4', '600.40'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ 'invalid',
+ '600.40']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ '1.2.3.4/567',
+ '-1.0']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'blacklist',
+ action,
+ '1.2.3.4/567',
+ '600.40',
+ 'toomany']))
+
+ def test_pool_mksnap(self):
+ self.assert_valid_command(['osd', 'pool', 'mksnap',
+ 'poolname', 'snapname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'mksnap',
+ 'poolname', 'snapname',
+ 'toomany']))
+
+ def test_pool_rmsnap(self):
+ self.assert_valid_command(['osd', 'pool', 'rmsnap',
+ 'poolname', 'snapname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rmsnap',
+ 'poolname', 'snapname',
+ 'toomany']))
+
+ def test_pool_create(self):
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128',
+ 'foo=bar'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128', '128',
+ 'foo=bar', 'baz=frob'])
+ self.assert_valid_command(['osd', 'pool', 'create',
+ 'poolname', '128',
+ 'foo=bar', 'baz=frob'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'create',
+ 'poolname', '-1']))
+
+ def test_pool_delete(self):
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ '--yes-i-really-really-mean-it'])
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname', 'poolname'])
+ self.assert_valid_command(['osd', 'pool', 'delete',
+ 'poolname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'delete']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ 'not really']))
+ assert_equal({}, validate_command(sigdict,
+ ['osd', 'pool', 'delete',
+ 'poolname', 'poolname',
+ '--yes-i-really-really-mean-it',
+ 'toomany']))
+
+ def test_pool_rename(self):
+ self.assert_valid_command(['osd', 'pool', 'rename',
+ 'poolname', 'othername'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool', 'rename',
+ 'poolname', 'othername',
+ 'toomany']))
+
+ def test_pool_get(self):
+ for var in ('size', 'min_size', 'crash_replay_interval',
+ 'pg_num', 'pgp_num', 'crush_ruleset'):
+ self.assert_valid_command(['osd', 'pool', 'get', 'poolname', var])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname',
+ 'size', 'toomany']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'get', 'poolname',
+ 'invalid']))
+
+ def test_pool_set(self):
+ for var in ('size', 'min_size', 'crash_replay_interval',
+ 'pg_num', 'pgp_num', 'crush_ruleset'):
+ self.assert_valid_command(['osd', 'pool',
+ 'set', 'poolname', var, '-1'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'size', 'invalid']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'invalid', '-1']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set', 'poolname',
+ 'size', '-1',
+ 'toomany']))
+
+ def test_pool_set_quota(self):
+ for field in ('max_objects', 'max_bytes'):
+ self.assert_valid_command(['osd', 'pool', 'set-quota',
+ 'poolname', field, '10K'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'max_objects']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'invalid',
+ '10K']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'pool',
+ 'set-quota',
+ 'poolname',
+ 'max_objects',
+ '10K',
+ 'toomany']))
+
+ def test_reweight_by_utilization(self):
+ self.assert_valid_command(['osd', 'reweight-by-utilization'])
+ self.assert_valid_command(['osd', 'reweight-by-utilization', '100'])
+ assert_equal({}, validate_command(sigdict, ['osd',
+ 'reweight-by-utilization',
+ '50']))
+ assert_equal({}, validate_command(sigdict, ['osd',
+ 'reweight-by-utilization',
+ '100',
+ 'toomany']))
+
+ def test_thrash(self):
+ self.check_1_natural_arg('osd', 'thrash')
+
+ def test_tier_op(self):
+ for op in ('add', 'remove', 'set-overlay'):
+ self.assert_valid_command(['osd', 'tier', op,
+ 'poolname', 'othername'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op]))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op,
+ 'poolname']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier', op,
+ 'poolname',
+ 'othername',
+ 'toomany']))
+
+ def test_tier_cache_mode(self):
+ for mode in ('none', 'writeback', 'invalidate+forward', 'readonly'):
+ self.assert_valid_command(['osd', 'tier', 'cache-mode',
+ 'poolname', mode])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'cache-mode']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'cache-mode',
+ 'invalid']))
+
+ def test_tier_remove_overlay(self):
+ self.assert_valid_command(['osd', 'tier', 'remove-overlay',
+ 'poolname'])
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'remove-overlay']))
+ assert_equal({}, validate_command(sigdict, ['osd', 'tier',
+ 'remove-overlay',
+ 'poolname',
+ 'toomany']))
+
+
+class TestConfigKey(TestArgparse):
+
+ def test_get(self):
+ self.check_1_string_arg('config-key', 'get')
+
+ def test_put(self):
+ self.assert_valid_command(['config-key', 'put',
+ 'key'])
+ self.assert_valid_command(['config-key', 'put',
+ 'key', 'value'])
+ assert_equal({}, validate_command(sigdict, ['config-key', 'put']))
+ assert_equal({}, validate_command(sigdict, ['config-key', 'put',
+ 'key', 'value',
+ 'toomany']))
+
+ def test_del(self):
+ self.check_1_string_arg('config-key', 'del')
+
+ def test_exists(self):
+ self.check_1_string_arg('config-key', 'exists')
+
+ def test_list(self):
+ self.check_no_arg('config-key', 'list')
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 &&
+# PYTHONPATH=pybind nosetests --stop \
+# test/pybind/test_ceph_argparse.py # test_ceph_argparse.py:TestOSD.test_rm"
+# End:
diff --git a/src/tools/ceph-filestore-dump.cc b/src/tools/ceph-filestore-dump.cc
index 49b8d10bdba..b4220bae307 100644
--- a/src/tools/ceph-filestore-dump.cc
+++ b/src/tools/ceph-filestore-dump.cc
@@ -52,6 +52,32 @@ enum {
END_OF_TYPES, //Keep at the end
};
+//#define INTERNAL_TEST
+//#define INTERNAL_TEST2
+
+#ifdef INTERNAL_TEST
+CompatSet get_test_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+#ifdef INTERNAL_TEST2
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+#endif
+
typedef uint8_t sectiontype_t;
typedef uint32_t mymagic_t;
typedef int64_t mysize_t;
@@ -69,7 +95,7 @@ const int fd_none = INT_MIN;
//can be added to the export format.
struct super_header {
static const uint32_t super_magic = (shortmagic << 16) | shortmagic;
- static const uint32_t super_ver = 1;
+ static const uint32_t super_ver = 2;
static const uint32_t FIXED_LENGTH = 16;
uint32_t magic;
uint32_t version;
@@ -139,18 +165,25 @@ struct footer {
struct pg_begin {
pg_t pgid;
+ OSDSuperblock superblock;
- pg_begin(pg_t pg): pgid(pg) { }
+ pg_begin(pg_t pg, OSDSuperblock sb):
+ pgid(pg), superblock(sb) { }
pg_begin() { }
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ // New super_ver prevents decode from ver 1
+ ENCODE_START(2, 2, bl);
::encode(pgid, bl);
+ ::encode(superblock, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(pgid, bl);
+ if (struct_v > 1) {
+ ::decode(superblock, bl);
+ }
DECODE_FINISH(bl);
}
};
@@ -347,8 +380,8 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
OSD::make_snapmapper_oid());
SnapMapper mapper(&driver, 0, 0, 0);
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
int r = 0;
int64_t num = 0;
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -358,13 +391,14 @@ void remove_coll(ObjectStore *store, const coll_t &coll)
&objects, &next);
if (r < 0)
goto out;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i, ++num) {
+ assert(i->generation == ghobject_t::NO_GEN);
OSDriver::OSTransaction _t(driver.get_transaction(t));
cout << "remove " << *i << std::endl;
- int r = mapper.remove_oid(*i, &_t);
+ int r = mapper.remove_oid(i->hobj, &_t);
if (r != 0 && r != -ENOENT) {
assert(0);
}
@@ -621,18 +655,19 @@ int export_file(ObjectStore *store, coll_t cid, hobject_t &obj)
int export_files(ObjectStore *store, coll_t coll)
{
- vector<hobject_t> objects;
- hobject_t next;
+ vector<ghobject_t> objects;
+ ghobject_t next;
while (!next.is_max()) {
int r = store->collection_list_partial(coll, next, 200, 300, 0,
&objects, &next);
if (r < 0)
return r;
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
- r = export_file(store, coll, *i);
+ assert(i->generation == ghobject_t::NO_GEN);
+ r = export_file(store, coll, i->hobj);
if (r < 0)
return r;
}
@@ -664,7 +699,7 @@ void write_super()
}
int do_export(ObjectStore *fs, coll_t coll, pg_t pgid, pg_info_t &info,
- epoch_t map_epoch, __u8 struct_ver)
+ epoch_t map_epoch, __u8 struct_ver, OSDSuperblock superblock)
{
PGLog::IndexedLog log;
pg_missing_t missing;
@@ -675,7 +710,7 @@ int do_export(ObjectStore *fs, coll_t coll, pg_t pgid, pg_info_t &info,
write_super();
- pg_begin pgb(pgid);
+ pg_begin pgb(pgid, superblock);
ret = write_section(TYPE_PG_BEGIN, pgb, file_fd);
if (ret)
return ret;
@@ -909,7 +944,7 @@ int get_pg_metadata(ObjectStore *store, coll_t coll, bufferlist &bl)
return 0;
}
-int do_import(ObjectStore *store)
+int do_import(ObjectStore *store, OSDSuperblock sb)
{
bufferlist ebl;
pg_info_t info;
@@ -943,7 +978,16 @@ int do_import(ObjectStore *store)
pg_begin pgb;
pgb.decode(ebliter);
pg_t pgid = pgb.pgid;
-
+
+ if (debug) {
+ cout << "Exported features: " << pgb.superblock.compat_features << std::endl;
+ }
+ if (sb.compat_features.compare(pgb.superblock.compat_features) == -1) {
+ cout << "Export has incompatible features set "
+ << pgb.superblock.compat_features << std::endl;
+ return 1;
+ }
+
log_oid = OSD::make_pg_log_oid(pgid);
biginfo_oid = OSD::make_pg_biginfo_oid(pgid);
@@ -1017,7 +1061,7 @@ int main(int argc, char **argv)
("pgid", po::value<string>(&pgidstr),
"PG id, mandatory")
("type", po::value<string>(&type),
- "Type one of info, log, export, or import, mandatory")
+ "Type one of info, log, remove, export, or import, mandatory")
("file", po::value<string>(&file),
"path of file to export or import")
("debug", "Enable diagnostic output to stderr")
@@ -1170,14 +1214,67 @@ int main(int argc, char **argv)
return 1;
}
+ bool fs_sharded_objects = fs->get_allow_sharded_objects();
+
int ret = 0;
vector<coll_t> ls;
vector<coll_t>::iterator it;
+ CompatSet supported;
+
+#ifdef INTERNAL_TEST
+ supported = get_test_compat_set();
+#else
+ supported = OSD::get_osd_compat_set();
+#endif
+
+ bufferlist bl;
+ OSDSuperblock superblock;
+ bufferlist::iterator p;
+ ret = fs->read(coll_t::META_COLL, OSD_SUPERBLOCK_POBJECT, 0, 0, bl);
+ if (ret < 0) {
+ cout << "Failure to read OSD superblock error= " << ret << std::endl;
+ goto out;
+ }
+
+ p = bl.begin();
+ ::decode(superblock, p);
+
+#ifdef INTERNAL_TEST2
+ fs->set_allow_sharded_objects();
+ assert(fs->get_allow_sharded_objects());
+ fs_sharded_objects = true;
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+#endif
+
+ if (debug && file_fd != STDOUT_FILENO) {
+ cout << "Supported features: " << supported << std::endl;
+ cout << "On-disk features: " << superblock.compat_features << std::endl;
+ }
+ if (supported.compare(superblock.compat_features) == -1) {
+ cout << "On-disk OSD incompatible features set "
+ << superblock.compat_features << std::endl;
+ ret = EINVAL;
+ goto out;
+ }
+
+ // If there was a crash as an OSD was transitioning to sharded objects
+ // and hadn't completed a set_allow_sharded_objects().
+ // This utility does not want to attempt to finish that transition.
+ if (superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS) != fs_sharded_objects) {
+ // An OSD should never have call set_allow_sharded_objects() before
+ // updating its own OSD features.
+ if (fs_sharded_objects)
+ cout << "FileStore sharded but OSD not set, Corruption?" << std::endl;
+ else
+ cout << "Found incomplete transition to sharded objects" << std::endl;
+ ret = EINVAL;
+ goto out;
+ }
if (type == "import") {
try {
- ret = do_import(fs);
+ ret = do_import(fs, superblock);
}
catch (const buffer::error &e) {
cout << "do_import threw exception error " << e.what() << std::endl;
@@ -1260,7 +1357,7 @@ int main(int argc, char **argv)
cerr << "struct_v " << (int)struct_ver << std::endl;
if (type == "export") {
- ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver);
+ ret = do_export(fs, coll, pgid, info, map_epoch, struct_ver, superblock);
} else if (type == "info") {
formatter->open_object_section("info");
info.dump(formatter);
diff --git a/src/tools/ceph-osdomap-tool.cc b/src/tools/ceph-osdomap-tool.cc
index aedc4c824e7..bde4b28b45f 100644
--- a/src/tools/ceph-osdomap-tool.cc
+++ b/src/tools/ceph-osdomap-tool.cc
@@ -115,30 +115,30 @@ int main(int argc, char **argv) {
i->value().hexdump(std::cout);
}
} else if (cmd == "dump-objects") {
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = omap.list_objects(&objects);
if (r < 0) {
std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
goto done;
}
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
std::cout << *i << std::endl;
}
r = 0;
} else if (cmd == "dump-objects-with-keys") {
- vector<hobject_t> objects;
+ vector<ghobject_t> objects;
r = omap.list_objects(&objects);
if (r < 0) {
std::cerr << "list_objects got: " << cpp_strerror(r) << std::endl;
goto done;
}
- for (vector<hobject_t>::iterator i = objects.begin();
+ for (vector<ghobject_t>::iterator i = objects.begin();
i != objects.end();
++i) {
std::cout << "Object: " << *i << std::endl;
- ObjectMap::ObjectMapIterator j = omap.get_iterator(*i);
+ ObjectMap::ObjectMapIterator j = omap.get_iterator(i->hobj);
for (j->seek_to_first(); j->valid(); j->next()) {
std::cout << j->key() << std::endl;
j->value().hexdump(std::cout);
diff --git a/src/tools/dupstore.cc b/src/tools/dupstore.cc
index e17eb2201a7..c8b8ece31c8 100644
--- a/src/tools/dupstore.cc
+++ b/src/tools/dupstore.cc
@@ -27,7 +27,7 @@ int dupstore(ObjectStore* src, ObjectStore* dst)
if (dst->mount() < 0) return 1;
// objects
- hash_map<hobject_t, coll_t> did_object;
+ hash_map<ghobject_t, coll_t> did_object;
// collections
vector<coll_t> collections;
@@ -54,11 +54,11 @@ int dupstore(ObjectStore* src, ObjectStore* dst)
dst->apply_transaction(t);
}
- vector<hobject_t> o;
+ vector<ghobject_t> o;
src->collection_list(*p, o);
int numo = o.size();
int j = 1;
- for (vector<hobject_t>::iterator q = o.begin(); q != o.end(); ++q) {
+ for (vector<ghobject_t>::iterator q = o.begin(); q != o.end(); ++q) {
ObjectStore::Transaction t;
if (did_object.count(*q))
t.collection_add(*p, did_object[*q], *q);
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index e8386959349..ad8eaa3e1a4 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -98,6 +98,7 @@ void usage(ostream& out)
" rmomapkey <obj-name> <key>\n"
" getomapheader <obj-name>\n"
" setomapheader <obj-name> <val>\n"
+" tmap-to-omap <obj-name> convert tmap keys/values to omap\n"
" listwatchers <obj-name> list the watchers of this object\n"
"\n"
"IMPORT AND EXPORT\n"
@@ -1813,8 +1814,15 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
bufferlist::iterator p = outdata.begin();
bufferlist header;
map<string, bufferlist> kv;
- ::decode(header, p);
- ::decode(kv, p);
+ try {
+ ::decode(header, p);
+ ::decode(kv, p);
+ }
+ catch (buffer::error& e) {
+ cerr << "error decoding tmap " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
cout << "header (" << header.length() << " bytes):\n";
header.hexdump(cout);
cout << "\n";
@@ -1841,6 +1849,50 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
}
}
+ else if (strcmp(nargs[0], "tmap-to-omap") == 0) {
+ if (!pool_name || nargs.size() < 2)
+ usage_exit();
+ string oid(nargs[1]);
+
+ bufferlist bl;
+ int r = io_ctx.tmap_get(oid, bl);
+ if (r < 0) {
+ ret = r;
+ cerr << "error reading tmap " << pool_name << "/" << oid
+ << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ bufferlist hdr;
+ map<string, bufferlist> kv;
+ bufferlist::iterator p = bl.begin();
+ try {
+ ::decode(hdr, p);
+ ::decode(kv, p);
+ }
+ catch (buffer::error& e) {
+ cerr << "error decoding tmap " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!p.end()) {
+ cerr << "error decoding tmap (stray trailing data) in " << pool_name << "/" << oid << std::endl;
+ ret = -EINVAL;
+ goto out;
+ }
+ librados::ObjectWriteOperation wr;
+ wr.omap_set_header(hdr);
+ wr.omap_set(kv);
+ wr.truncate(0); // delete the old tmap data
+ r = io_ctx.operate(oid, &wr);
+ if (r < 0) {
+ ret = r;
+ cerr << "error writing tmap data as omap on " << pool_name << "/" << oid
+ << ": " << cpp_strerror(ret) << std::endl;
+ goto out;
+ }
+ ret = 0;
+ }
+
else if (strcmp(nargs[0], "mkpool") == 0) {
int auid = 0;
__u8 crush_rule = 0;
@@ -2235,8 +2287,9 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
}
} else {
- cerr << "unrecognized command " << nargs[0] << std::endl;
- usage_exit();
+ cerr << "unrecognized command " << nargs[0] << "; -h or --help for usage" << std::endl;
+ ret = -EINVAL;
+ goto out;
}
if (ret < 0)
diff --git a/src/vstart.sh b/src/vstart.sh
index c112bfc9138..def480779de 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -249,11 +249,11 @@ if [ -n "$ip" ]; then
IP="$ip"
else
echo hostname $HOSTNAME
- RAW_IP=`hostname --ip-address`
+ RAW_IP=`hostname -I`
# filter out IPv6 and localhost addresses
IP="$(echo "$RAW_IP"|tr ' ' '\012'|grep -v :|grep -v '^127\.'|head -n1)"
# if that left nothing, then try to use the raw thing, it might work
- if [ -z "IP" ]; then IP="$RAW_IP"; fi
+ if [ -z "$IP" ]; then IP="$RAW_IP"; fi
echo ip $IP
fi
echo "ip $IP"
@@ -339,6 +339,7 @@ $DAEMONOPTS
$COSDDEBUG
$extra_conf
[mon]
+ mon pg warn min per osd = 10
$DAEMONOPTS
$CMONDEBUG
$extra_conf