summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/install/build-packages.rst2
-rw-r--r--doc/install/build-prerequisites.rst8
-rw-r--r--doc/install/building-ceph.rst15
-rw-r--r--doc/rados/configuration/filesystem-recommendations.rst7
-rw-r--r--doc/rados/operations/index.rst1
-rw-r--r--doc/rados/operations/memory-profiling.rst96
-rw-r--r--doc/rados/operations/pools.rst35
-rwxr-xr-xqa/workunits/suites/dbench-short.sh5
-rwxr-xr-xqa/workunits/suites/pjd.sh4
-rw-r--r--src/ceph_fuse.cc5
-rw-r--r--src/log/Log.cc2
-rw-r--r--src/os/FileJournal.cc75
-rw-r--r--src/test/test_filejournal.cc47
13 files changed, 244 insertions, 58 deletions
diff --git a/doc/install/build-packages.rst b/doc/install/build-packages.rst
index c8f405a76b1..2e5662099d2 100644
--- a/doc/install/build-packages.rst
+++ b/doc/install/build-packages.rst
@@ -19,7 +19,7 @@ To create ``.deb`` packages for Debian/Ubuntu, ensure that you have cloned the
sudo apt-get install debhelper
-Once you have installed debhelper, you can build the packages:
+Once you have installed debhelper, you can build the packages::
sudo dpkg-buildpackage
diff --git a/doc/install/build-prerequisites.rst b/doc/install/build-prerequisites.rst
index f68a68fc10b..143f6b4620d 100644
--- a/doc/install/build-prerequisites.rst
+++ b/doc/install/build-prerequisites.rst
@@ -38,8 +38,14 @@ installed on your host. ::
On Debian/Squeeze, execute ``aptitude install`` for each dependency that isn't
installed on your host. ::
- aptitude install autotools-dev autoconf automake cdbs gcc g++ git libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libcrypto++ libexpat1-dev
+ aptitude install autotools-dev autoconf automake cdbs gcc g++ git libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libcrypto++ libexpat1-dev pkg-config libcurl4-gnutls-dev
+
+On Debian/Wheezy, you may also need::
+ keyutils-dev libaio and libboost-thread-dev
+
+.. note:: Some distributions that support Google's memory profiler tool may use
+ a different package name (e.g., ``libgoogle-perftools4``).
Ubuntu
======
diff --git a/doc/install/building-ceph.rst b/doc/install/building-ceph.rst
index 6795c6f2591..e8b4b3551b7 100644
--- a/doc/install/building-ceph.rst
+++ b/doc/install/building-ceph.rst
@@ -11,16 +11,19 @@ following::
./configure
make
-You can use ``make -j`` to execute multiple jobs depending upon your system. For
-example::
+.. topic:: Hyperthreading
+
+ You can use ``make -j`` to execute multiple jobs depending upon your system. For
+ example, ``make -j4`` for a dual core processor may build faster.
- make -j4
To install Ceph locally, you may also use::
sudo make install
-If you install Ceph locally, ``make`` will place the executables in
-``usr/local/bin``. You may add the ``ceph.conf`` file to the ``usr/local/bin``
-directory to run an evaluation environment of Ceph from a single directory.
+If you install Ceph locally, ``make`` will place the executables in
+``/usr/local/bin``. You may add the Ceph configuration file to the
+``/usr/local/bin`` directory to run an evaluation environment of Ceph from a
+single directory.
+.. _Memory Profiling: ../../rados/operations/memory-profiling
\ No newline at end of file
diff --git a/doc/rados/configuration/filesystem-recommendations.rst b/doc/rados/configuration/filesystem-recommendations.rst
index 17908cc3d19..cb1aca875ed 100644
--- a/doc/rados/configuration/filesystem-recommendations.rst
+++ b/doc/rados/configuration/filesystem-recommendations.rst
@@ -41,8 +41,11 @@ The underlying file system must provide sufficient capacity for
XATTRs. ``btrfs`` does not bound the total xattr metadata stored with
a file. ``XFS`` has a relatively large limit (64 KB) that most
deployments won't encounter, but the ``ext4`` is too small to be
-usable. To use these file systems, you should add the following like
-to the ``[osd]`` section of your ``ceph.conf`` file.::
+usable.
+
+You should always add the following line to the ``[osd]`` section of your
+``ceph.conf`` file for ``ext4`` filesystems; you can optionally use
+it for ``btrfs`` and ``XFS``::
filestore xattr use omap = true
diff --git a/doc/rados/operations/index.rst b/doc/rados/operations/index.rst
index 032b567bc90..fca327a89b0 100644
--- a/doc/rados/operations/index.rst
+++ b/doc/rados/operations/index.rst
@@ -15,6 +15,7 @@ and, monitoring an operating cluster.
operating
monitoring
cpu-profiling
+ memory-profiling
troubleshooting
debug
diff --git a/doc/rados/operations/memory-profiling.rst b/doc/rados/operations/memory-profiling.rst
new file mode 100644
index 00000000000..dc46dcc888c
--- /dev/null
+++ b/doc/rados/operations/memory-profiling.rst
@@ -0,0 +1,96 @@
+==================
+ Memory Profiling
+==================
+
+Ceph OSD and metadata server daemons can generate heap profiles using
+``tcmalloc``. To generate heap profiles, ensure you have ``google-perftools``
+installed::
+
+ sudo apt-get install google-perftools
+
+The profiler dumps output to your ``log file`` directory (i.e.,
+``/var/log/ceph``). See `Logging and Debugging Config Reference`_ for details.
+To view the profiler logs with Google's performance tools, execute the
+following::
+
+ google-pprof -gv {log-path/filename}
+
+Refer to `Google Heap Profiler`_ for additional details.
+
+Once you have the heap profiler installed, start your cluster and begin using
+the heap profiler. You may enable or disable the heap profiler at runtime, or
+ensure that it runs continuously. For the following command-line usage, replace
+``{daemon-type}`` with ``osd`` or ``mds``, and replace ``{daemon-id}`` with the
+OSD number or metadata server letter.
+
+
+Starting the Profiler
+---------------------
+
+To start the heap profiler, execute the following::
+
+ ceph {daemon-type} tell {daemon-id} heap start_profiler
+
+For example::
+
+ ceph osd tell 1 heap start_profiler
+
+
+Printing Stats
+--------------
+
+To print out statistics, execute the following::
+
+ ceph {daemon-type} tell {daemon-id} heap stats
+
+For example::
+
+ ceph osd tell 0 heap stats
+
+.. note:: Printing stats does not require the profiler to be running and does
+ not dump the heap allocation information to a file.
+
+
+Dumping Heap Information
+------------------------
+
+To dump heap information, execute the following::
+
+ ceph {daemon-type} tell {daemon-id} heap dump
+
+For example::
+
+ ceph mds tell a heap dump
+
+.. note:: Dumping heap information only works when the profiler is running.
+
+
+Releasing Memory
+----------------
+
+To release memory that ``tcmalloc`` has allocated but which is not being used by
+the Ceph daemon itself, execute the following::
+
+ ceph {daemon-type} tell {daemon-id} heap release
+
+For example::
+
+ ceph osd tell 2 heap release
+
+
+Stopping the Profiler
+---------------------
+
+To stop the heap profiler, execute the following::
+
+ ceph {daemon-type} tell {daemon-id} heap stop_profiler
+
+For example::
+
+ ceph osd tell 1 heap stop_profiler
+
+.. _Logging and Debugging Config Reference: ../../configuration/log-and-debug-ref
+.. _Google Heap Profiler: http://google-perftools.googlecode.com/svn/trunk/doc/heapprofile.html
+
+
+
diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst
index 7a6bc6fc61d..a13fa91bcd8 100644
--- a/doc/rados/operations/pools.rst
+++ b/doc/rados/operations/pools.rst
@@ -64,21 +64,36 @@ Where:
``{pg-num}``
-:Description: The total number of placement groups for the pool
+:Description: The total number of placement groups for the pool. See `Placement
+ Groups`_ for details on calculating a suitable number. The
+ default value ``8`` is NOT suitable for most systems.
+
:Type: Integer
-:Required: No
+:Required: Yes
+:Default: 8
``{pgp-num}``
-:Description: The total number of placement groups for placement purposes.
-:Type: Integer
-:Required: No
-
-When you create a pool, you should consider setting the number of
-placement groups.
+:Description: The total number of placement groups for placement purposes. This
+ **should be equal to the total number of placement groups**, except
+ for placement group splitting scenarios.
-.. important:: You cannot change the number of placement groups in a pool
- after you create it.
+:Type: Integer
+:Required: Yes
+:Default: 8
+
+When you create a pool, set the number of placement groups to a reasonable value
+(e.g., ``100``). Consider the total number of placement groups per OSD too.
+Placement groups are computationally expensive, so performance will degrade when
+you have many pools with many placement groups (e.g., 50 pools with 100
+placement groups each). The point of diminishing returns depends upon the power
+of the OSD host.
+
+.. important:: Increasing the number of placement groups in a pool after you
+ create the pool is still an experimental feature in Bobtail (v 0.56). We
+ recommend defining a reasonable number of placement groups and maintaining
+ that number until Ceph's placement group splitting and merging
+ functionality matures.
See `Placement Groups`_ for details on calculating an appropriate number of
placement groups for your pool.
diff --git a/qa/workunits/suites/dbench-short.sh b/qa/workunits/suites/dbench-short.sh
new file mode 100755
index 00000000000..7297d835fb9
--- /dev/null
+++ b/qa/workunits/suites/dbench-short.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+set -e
+
+dbench 1
diff --git a/qa/workunits/suites/pjd.sh b/qa/workunits/suites/pjd.sh
index d82bd35478a..008ff4f8e50 100755
--- a/qa/workunits/suites/pjd.sh
+++ b/qa/workunits/suites/pjd.sh
@@ -2,7 +2,7 @@
set -e
-wget http://ceph.com/qa/pjd.tgz
+wget http://ceph.com/qa/pjd-fstest-20090130-RC-open24.tgz
tar zxvf pjd*.tgz
cd pjd*
make
@@ -12,5 +12,5 @@ cd tmp
# must be root!
sudo prove -r -v --exec 'bash -x' ../pjd*/tests
cd ..
-rm -r tmp pjd*
+rm -rf tmp pjd*
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index de513bcf0a8..f7a8d1a2c73 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -155,14 +155,15 @@ int main(int argc, const char **argv, const char *envp[]) {
}
r = cfuse.init(newargc, newargv);
- if (r < 0) {
+ if (r != 0) {
cerr << "ceph-fuse[" << getpid() << "]: fuse failed to initialize" << std::endl;
- goto out_shutdown;
+ goto out_client_unmount;
}
cerr << "ceph-fuse[" << getpid() << "]: starting fuse" << std::endl;
r = cfuse.loop();
cerr << "ceph-fuse[" << getpid() << "]: fuse finished with error " << r << std::endl;
+ out_client_unmount:
client->unmount();
//cout << "unmounted" << std::endl;
diff --git a/src/log/Log.cc b/src/log/Log.cc
index 2912463f6b6..e06afbfe1e2 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -252,7 +252,7 @@ void Log::_log_message(const char *s, bool crash)
void Log::dump_recent()
{
- pthread_mutex_unlock(&m_flush_mutex);
+ pthread_mutex_lock(&m_flush_mutex);
pthread_mutex_lock(&m_queue_mutex);
EntryQueue t;
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index 5f31406db40..20324f37a3d 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -1147,8 +1147,10 @@ void FileJournal::write_thread_entry()
}
assert(r == 0);
- logger->inc(l_os_j_wr);
- logger->inc(l_os_j_wr_bytes, bl.length());
+ if (logger) {
+ logger->inc(l_os_j_wr);
+ logger->inc(l_os_j_wr_bytes, bl.length());
+ }
#ifdef HAVE_LIBAIO
if (aio)
@@ -1249,40 +1251,51 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
- aio_queue.push_back(aio_info(bl, pos, seq));
- aio_info& aio = aio_queue.back();
+ while (bl.length() > 0) {
+ int max = MIN(bl.buffers().size(), IOV_MAX-1);
+ iovec *iov = new iovec[max];
+ int n = 0;
+ unsigned len = 0;
+ for (std::list<buffer::ptr>::const_iterator p = bl.buffers().begin();
+ n < max;
+ ++p, ++n) {
+ assert(p != bl.buffers().end());
+ iov[n].iov_base = (void *)p->c_str();
+ iov[n].iov_len = p->length();
+ len += p->length();
+ }
- aio.iov = new iovec[aio.bl.buffers().size()];
- int n = 0;
- for (std::list<buffer::ptr>::const_iterator p = aio.bl.buffers().begin();
- p != aio.bl.buffers().end();
- ++p, ++n) {
- aio.iov[n].iov_base = (void *)p->c_str();
- aio.iov[n].iov_len = p->length();
- }
- io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
+ bufferlist tbl;
+ bl.splice(0, len, &tbl); // move bytes from bl -> tbl
- dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
- << " in " << n << dendl;
+ aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
+ aio_info& aio = aio_queue.back();
+ aio.iov = iov;
- aio_num++;
- aio_bytes += aio.len;
+ io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
- iocb *piocb = &aio.iocb;
- int attempts = 10;
- do {
- int r = io_submit(aio_ctx, 1, &piocb);
- if (r < 0) {
- derr << "io_submit to " << aio.off << "~" << aio.len
- << " got " << cpp_strerror(r) << dendl;
- if (r == -EAGAIN && attempts-- > 0) {
- usleep(500);
- continue;
+ dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
+ << " in " << n << dendl;
+
+ aio_num++;
+ aio_bytes += aio.len;
+
+ iocb *piocb = &aio.iocb;
+ int attempts = 10;
+ do {
+ int r = io_submit(aio_ctx, 1, &piocb);
+ if (r < 0) {
+ derr << "io_submit to " << aio.off << "~" << aio.len
+ << " got " << cpp_strerror(r) << dendl;
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(500);
+ continue;
+ }
+ assert(0 == "io_submit got unexpected error");
}
- assert(0 == "io_submit got unexpected error");
- }
- } while (false);
- pos += aio.len;
+ } while (false);
+ pos += aio.len;
+ }
write_finish_cond.Signal();
return 0;
}
diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc
index c3af26bc3e4..5b7576dea39 100644
--- a/src/test/test_filejournal.cc
+++ b/src/test/test_filejournal.cc
@@ -1,5 +1,6 @@
#include <gtest/gtest.h>
#include <stdlib.h>
+#include <limits.h>
#include "common/ceph_argparse.h"
#include "common/common_init.h"
@@ -69,8 +70,13 @@ int main(int argc, char **argv) {
finisher = new Finisher(g_ceph_context);
- srand(getpid()+time(0));
- snprintf(path, sizeof(path), "/tmp/test_filejournal.tmp.%d", rand());
+ if (args.size()) {
+ strcpy(path, args[0]);
+ } else {
+ srand(getpid()+time(0));
+ snprintf(path, sizeof(path), "/tmp/test_filejournal.tmp.%d", rand());
+ }
+ cout << "path " << path << std::endl;
::testing::InitGoogleTest(&argc, argv);
@@ -160,6 +166,43 @@ TEST(TestFileJournal, WriteMany) {
j.close();
}
+TEST(TestFileJournal, WriteManyVecs) {
+ fsid.generate_random();
+ FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);
+ ASSERT_EQ(0, j.create());
+ j.make_writeable();
+
+ C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&lock, &cond, &done));
+
+ bufferlist first;
+ first.append("small");
+ j.submit_entry(1, first, 0, gb.new_sub());
+
+ bufferlist bl;
+ for (int i=0; i<IOV_MAX * 2; i++) {
+ bufferptr bp = buffer::create_page_aligned(4096);
+ memset(bp.c_str(), (char)i, 4096);
+ bl.append(bp);
+ }
+ bufferlist origbl = bl;
+ j.submit_entry(2, bl, 0, gb.new_sub());
+ gb.activate();
+ wait();
+
+ j.close();
+
+ j.open(1);
+ bufferlist inbl;
+ string v;
+ uint64_t seq = 0;
+ ASSERT_EQ(true, j.read_entry(inbl, seq));
+ ASSERT_EQ(seq, 2ull);
+ ASSERT_TRUE(inbl.contents_equal(origbl));
+ j.make_writeable();
+ j.close();
+
+}
+
TEST(TestFileJournal, ReplaySmall) {
fsid.generate_random();
FileJournal j(fsid, finisher, &sync_cond, path, directio, aio);