diff options
-rw-r--r-- | doc/install/build-packages.rst | 2 | ||||
-rw-r--r-- | doc/install/build-prerequisites.rst | 8 | ||||
-rw-r--r-- | doc/install/building-ceph.rst | 15 | ||||
-rw-r--r-- | doc/rados/configuration/filesystem-recommendations.rst | 7 | ||||
-rw-r--r-- | doc/rados/operations/index.rst | 1 | ||||
-rw-r--r-- | doc/rados/operations/memory-profiling.rst | 96 | ||||
-rw-r--r-- | doc/rados/operations/pools.rst | 35 | ||||
-rwxr-xr-x | qa/workunits/suites/dbench-short.sh | 5 | ||||
-rwxr-xr-x | qa/workunits/suites/pjd.sh | 4 | ||||
-rw-r--r-- | src/ceph_fuse.cc | 5 | ||||
-rw-r--r-- | src/log/Log.cc | 2 | ||||
-rw-r--r-- | src/os/FileJournal.cc | 75 | ||||
-rw-r--r-- | src/test/test_filejournal.cc | 47 |
13 files changed, 244 insertions, 58 deletions
diff --git a/doc/install/build-packages.rst b/doc/install/build-packages.rst index c8f405a76b1..2e5662099d2 100644 --- a/doc/install/build-packages.rst +++ b/doc/install/build-packages.rst @@ -19,7 +19,7 @@ To create ``.deb`` packages for Debian/Ubuntu, ensure that you have cloned the sudo apt-get install debhelper -Once you have installed debhelper, you can build the packages: +Once you have installed debhelper, you can build the packages:: sudo dpkg-buildpackage diff --git a/doc/install/build-prerequisites.rst b/doc/install/build-prerequisites.rst index f68a68fc10b..143f6b4620d 100644 --- a/doc/install/build-prerequisites.rst +++ b/doc/install/build-prerequisites.rst @@ -38,8 +38,14 @@ installed on your host. :: On Debian/Squeeze, execute ``aptitude install`` for each dependency that isn't installed on your host. :: - aptitude install autotools-dev autoconf automake cdbs gcc g++ git libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libcrypto++ libexpat1-dev + aptitude install autotools-dev autoconf automake cdbs gcc g++ git libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libcrypto++ libexpat1-dev pkg-config libcurl4-gnutls-dev + +On Debian/Wheezy, you may also need:: + keyutils-dev libaio and libboost-thread-dev + +.. note:: Some distributions that support Google's memory profiler tool may use + a different package name (e.g., ``libgoogle-perftools4``). Ubuntu ====== diff --git a/doc/install/building-ceph.rst b/doc/install/building-ceph.rst index 6795c6f2591..e8b4b3551b7 100644 --- a/doc/install/building-ceph.rst +++ b/doc/install/building-ceph.rst @@ -11,16 +11,19 @@ following:: ./configure make -You can use ``make -j`` to execute multiple jobs depending upon your system. For -example:: +.. topic:: Hyperthreading + + You can use ``make -j`` to execute multiple jobs depending upon your system. 
For + example, ``make -j4`` for a dual-core processor may build faster. - make -j4 To install Ceph locally, you may also use:: sudo make install -If you install Ceph locally, ``make`` will place the executables in -``usr/local/bin``. You may add the ``ceph.conf`` file to the ``usr/local/bin`` -directory to run an evaluation environment of Ceph from a single directory. +If you install Ceph locally, ``make`` will place the executables in +``/usr/local/bin``. You may add the Ceph configuration file to the +``/usr/local/bin`` directory to run an evaluation environment of Ceph from a +single directory. +.. _Memory Profiling: ../../rados/operations/memory-profiling
\ No newline at end of file diff --git a/doc/rados/configuration/filesystem-recommendations.rst b/doc/rados/configuration/filesystem-recommendations.rst index 17908cc3d19..cb1aca875ed 100644 --- a/doc/rados/configuration/filesystem-recommendations.rst +++ b/doc/rados/configuration/filesystem-recommendations.rst @@ -41,8 +41,11 @@ The underlying file system must provide sufficient capacity for XATTRs. ``btrfs`` does not bound the total xattr metadata stored with a file. ``XFS`` has a relatively large limit (64 KB) that most deployments won't encounter, but the ``ext4`` is too small to be -usable. To use these file systems, you should add the following like -to the ``[osd]`` section of your ``ceph.conf`` file.:: +usable. + +You should always add the following line to the ``[osd]`` section of your +``ceph.conf`` file for ``ext4`` filesystems; you can optionally use +it for ``btrfs`` and ``XFS``.:: filestore xattr use omap = true diff --git a/doc/rados/operations/index.rst b/doc/rados/operations/index.rst index 032b567bc90..fca327a89b0 100644 --- a/doc/rados/operations/index.rst +++ b/doc/rados/operations/index.rst @@ -15,6 +15,7 @@ and, monitoring an operating cluster. operating monitoring cpu-profiling + memory-profiling troubleshooting debug diff --git a/doc/rados/operations/memory-profiling.rst b/doc/rados/operations/memory-profiling.rst new file mode 100644 index 00000000000..dc46dcc888c --- /dev/null +++ b/doc/rados/operations/memory-profiling.rst @@ -0,0 +1,96 @@ +================== + Memory Profiling +================== + +Ceph OSD and metadata server daemons can generate heap profiles using +``tcmalloc``. To generate heap profiles, ensure you have ``google-perftools`` +installed:: + + sudo apt-get install google-perftools + +The profiler dumps output to your ``log file`` directory (i.e., +``/var/log/ceph``). See `Logging and Debugging Config Reference`_ for details. 
+To view the profiler logs with Google's performance tools, execute the +following:: + + google-pprof -gv {log-path/filename} + +Refer to `Google Heap Profiler`_ for additional details. + +Once you have the heap profiler installed, start your cluster and begin using +the heap profiler. You may enable or disable the heap profiler at runtime, or +ensure that it runs continuously. For the following command-line usage, replace +``{daemon-type}`` with ``osd`` or ``mds``, and replace ``{daemon-id}`` with the +OSD number or metadata server letter. + + +Starting the Profiler +--------------------- + +To start the heap profiler, execute the following:: + + ceph {daemon-type} tell {daemon-id} heap start_profiler + +For example:: + + ceph osd tell 1 heap start_profiler + + +Printing Stats +-------------- + +To print out statistics, execute the following:: + + ceph {daemon-type} tell {daemon-id} heap stats + +For example:: + + ceph osd tell 0 heap stats + +.. note:: Printing stats does not require the profiler to be running and does + not dump the heap allocation information to a file. + + +Dumping Heap Information +------------------------ + +To dump heap information, execute the following:: + + ceph {daemon-type} tell {daemon-id} heap dump + +For example:: + + ceph mds tell a heap dump + +.. note:: Dumping heap information only works when the profiler is running. + + +Releasing Memory +---------------- + +To release memory that ``tcmalloc`` has allocated but which is not being used by +the Ceph daemon itself, execute the following:: + + ceph {daemon-type} tell {daemon-id} heap release + +For example:: + + ceph osd tell 2 heap release + + +Stopping the Profiler +--------------------- + +To stop the heap profiler, execute the following:: + + ceph {daemon-type} tell {daemon-id} heap stop_profiler + +For example:: + + ceph osd tell 1 heap stop_profiler + +.. _Logging and Debugging Config Reference: ../../configuration/log-and-debug-ref +.. 
_Google Heap Profiler: http://google-perftools.googlecode.com/svn/trunk/doc/heapprofile.html + + + diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst index 7a6bc6fc61d..a13fa91bcd8 100644 --- a/doc/rados/operations/pools.rst +++ b/doc/rados/operations/pools.rst @@ -64,21 +64,36 @@ Where: ``{pg-num}`` -:Description: The total number of placement groups for the pool +:Description: The total number of placement groups for the pool. See `Placement + Groups`_ for details on calculating a suitable number. The + default value ``8`` is NOT suitable for most systems. + :Type: Integer -:Required: No +:Required: Yes +:Default: 8 ``{pgp-num}`` -:Description: The total number of placement groups for placement purposes. -:Type: Integer -:Required: No - -When you create a pool, you should consider setting the number of -placement groups. +:Description: The total number of placement groups for placement purposes. This + **should be equal to the total number of placement groups**, except + for placement group splitting scenarios. -.. important:: You cannot change the number of placement groups in a pool - after you create it. +:Type: Integer +:Required: Yes +:Default: 8 + +When you create a pool, set the number of placement groups to a reasonable value +(e.g., ``100``). Consider the total number of placement groups per OSD too. +Placement groups are computationally expensive, so performance will degrade when +you have many pools with many placement groups (e.g., 50 pools with 100 +placement groups each). The point of diminishing returns depends upon the power +of the OSD host. + +.. important:: Increasing the number of placement groups in a pool after you + create the pool is still an experimental feature in Bobtail (v 0.56). We + recommend defining a reasonable number of placement groups and maintaining + that number until Ceph's placement group splitting and merging + functionality matures. 
See `Placement Groups`_ for details on calculating an appropriate number of placement groups for your pool. diff --git a/qa/workunits/suites/dbench-short.sh b/qa/workunits/suites/dbench-short.sh new file mode 100755 index 00000000000..7297d835fb9 --- /dev/null +++ b/qa/workunits/suites/dbench-short.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +dbench 1 diff --git a/qa/workunits/suites/pjd.sh b/qa/workunits/suites/pjd.sh index d82bd35478a..008ff4f8e50 100755 --- a/qa/workunits/suites/pjd.sh +++ b/qa/workunits/suites/pjd.sh @@ -2,7 +2,7 @@ set -e -wget http://ceph.com/qa/pjd.tgz +wget http://ceph.com/qa/pjd-fstest-20090130-RC-open24.tgz tar zxvf pjd*.tgz cd pjd* make @@ -12,5 +12,5 @@ cd tmp # must be root! sudo prove -r -v --exec 'bash -x' ../pjd*/tests cd .. -rm -r tmp pjd* +rm -rf tmp pjd* diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc index de513bcf0a8..f7a8d1a2c73 100644 --- a/src/ceph_fuse.cc +++ b/src/ceph_fuse.cc @@ -155,14 +155,15 @@ int main(int argc, const char **argv, const char *envp[]) { } r = cfuse.init(newargc, newargv); - if (r < 0) { + if (r != 0) { cerr << "ceph-fuse[" << getpid() << "]: fuse failed to initialize" << std::endl; - goto out_shutdown; + goto out_client_unmount; } cerr << "ceph-fuse[" << getpid() << "]: starting fuse" << std::endl; r = cfuse.loop(); cerr << "ceph-fuse[" << getpid() << "]: fuse finished with error " << r << std::endl; + out_client_unmount: client->unmount(); //cout << "unmounted" << std::endl; diff --git a/src/log/Log.cc b/src/log/Log.cc index 2912463f6b6..e06afbfe1e2 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -252,7 +252,7 @@ void Log::_log_message(const char *s, bool crash) void Log::dump_recent() { - pthread_mutex_unlock(&m_flush_mutex); + pthread_mutex_lock(&m_flush_mutex); pthread_mutex_lock(&m_queue_mutex); EntryQueue t; diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index 5f31406db40..20324f37a3d 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -1147,8 +1147,10 @@ void 
FileJournal::write_thread_entry() } assert(r == 0); - logger->inc(l_os_j_wr); - logger->inc(l_os_j_wr_bytes, bl.length()); + if (logger) { + logger->inc(l_os_j_wr); + logger->inc(l_os_j_wr_bytes, bl.length()); + } #ifdef HAVE_LIBAIO if (aio) @@ -1249,40 +1251,51 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq) dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl; - aio_queue.push_back(aio_info(bl, pos, seq)); - aio_info& aio = aio_queue.back(); + while (bl.length() > 0) { + int max = MIN(bl.buffers().size(), IOV_MAX-1); + iovec *iov = new iovec[max]; + int n = 0; + unsigned len = 0; + for (std::list<buffer::ptr>::const_iterator p = bl.buffers().begin(); + n < max; + ++p, ++n) { + assert(p != bl.buffers().end()); + iov[n].iov_base = (void *)p->c_str(); + iov[n].iov_len = p->length(); + len += p->length(); + } - aio.iov = new iovec[aio.bl.buffers().size()]; - int n = 0; - for (std::list<buffer::ptr>::const_iterator p = aio.bl.buffers().begin(); - p != aio.bl.buffers().end(); - ++p, ++n) { - aio.iov[n].iov_base = (void *)p->c_str(); - aio.iov[n].iov_len = p->length(); - } - io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos); + bufferlist tbl; + bl.splice(0, len, &tbl); // move bytes from bl -> tbl - dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len - << " in " << n << dendl; + aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq)); + aio_info& aio = aio_queue.back(); + aio.iov = iov; - aio_num++; - aio_bytes += aio.len; + io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos); - iocb *piocb = &aio.iocb; - int attempts = 10; - do { - int r = io_submit(aio_ctx, 1, &piocb); - if (r < 0) { - derr << "io_submit to " << aio.off << "~" << aio.len - << " got " << cpp_strerror(r) << dendl; - if (r == -EAGAIN && attempts-- > 0) { - usleep(500); - continue; + dout(20) << "write_aio_bl .. 
" << aio.off << "~" << aio.len + << " in " << n << dendl; + + aio_num++; + aio_bytes += aio.len; + + iocb *piocb = &aio.iocb; + int attempts = 10; + do { + int r = io_submit(aio_ctx, 1, &piocb); + if (r < 0) { + derr << "io_submit to " << aio.off << "~" << aio.len + << " got " << cpp_strerror(r) << dendl; + if (r == -EAGAIN && attempts-- > 0) { + usleep(500); + continue; + } + assert(0 == "io_submit got unexpected error"); } - assert(0 == "io_submit got unexpected error"); - } - } while (false); - pos += aio.len; + } while (false); + pos += aio.len; + } write_finish_cond.Signal(); return 0; } diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc index c3af26bc3e4..5b7576dea39 100644 --- a/src/test/test_filejournal.cc +++ b/src/test/test_filejournal.cc @@ -1,5 +1,6 @@ #include <gtest/gtest.h> #include <stdlib.h> +#include <limits.h> #include "common/ceph_argparse.h" #include "common/common_init.h" @@ -69,8 +70,13 @@ int main(int argc, char **argv) { finisher = new Finisher(g_ceph_context); - srand(getpid()+time(0)); - snprintf(path, sizeof(path), "/tmp/test_filejournal.tmp.%d", rand()); + if (args.size()) { + strcpy(path, args[0]); + } else { + srand(getpid()+time(0)); + snprintf(path, sizeof(path), "/tmp/test_filejournal.tmp.%d", rand()); + } + cout << "path " << path << std::endl; ::testing::InitGoogleTest(&argc, argv); @@ -160,6 +166,43 @@ TEST(TestFileJournal, WriteMany) { j.close(); } +TEST(TestFileJournal, WriteManyVecs) { + fsid.generate_random(); + FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); + ASSERT_EQ(0, j.create()); + j.make_writeable(); + + C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&lock, &cond, &done)); + + bufferlist first; + first.append("small"); + j.submit_entry(1, first, 0, gb.new_sub()); + + bufferlist bl; + for (int i=0; i<IOV_MAX * 2; i++) { + bufferptr bp = buffer::create_page_aligned(4096); + memset(bp.c_str(), (char)i, 4096); + bl.append(bp); + } + bufferlist origbl = bl; + j.submit_entry(2, bl, 
0, gb.new_sub()); + gb.activate(); + wait(); + + j.close(); + + j.open(1); + bufferlist inbl; + string v; + uint64_t seq = 0; + ASSERT_EQ(true, j.read_entry(inbl, seq)); + ASSERT_EQ(seq, 2ull); + ASSERT_TRUE(inbl.contents_equal(origbl)); + j.make_writeable(); + j.close(); + +} + TEST(TestFileJournal, ReplaySmall) { fsid.generate_random(); FileJournal j(fsid, finisher, &sync_cond, path, directio, aio); |