From 48dadae23eb630a7c7abb33c1d2fcb002c8492b1 Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Sun, 21 Jul 2013 11:56:48 -0700 Subject: FileJournal: zero-fill in-lieu of posix_fallocate Zero-fill journal if posix_fallocate fails or if it is not supported. For very large journals zero fill can take a long time. An optimization is to write a zero byte to the end of each block, or use platform specific features for file allocation. Reference solutions for various platforms can be found in Mozilla, SQLite, and PostgreSQL. Signed-off-by: Noah Watkins --- configure.ac | 2 ++ src/os/FileJournal.cc | 37 +++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index 8f6b17b86e5..1d628d0a1b2 100644 --- a/configure.ac +++ b/configure.ac @@ -538,6 +538,8 @@ AC_CHECK_MEMBER([struct stat.st_mtimespec.tv_nsec], AC_CHECK_FUNC([extattr_set_fd], [AC_DEFINE(HAVE_EXTATTR, 1, [Define if you have extattr_set_fd])]) +AC_CHECK_FUNCS([posix_fallocate]) + AC_CHECK_HEADERS([linux/fs.h]) AC_CHECK_HEADERS([sys/disk.h]) AC_CHECK_HEADERS([sys/prctl.h]) diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc index 4a2af08dd4c..c58181fbb8b 100644 --- a/src/os/FileJournal.cc +++ b/src/os/FileJournal.cc @@ -12,6 +12,7 @@ * */ +#include "acconfig.h" #include "common/debug.h" #include "common/errno.h" #include "common/safe_io.h" @@ -277,6 +278,8 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, bool create) { int ret; + bool zero_fill = false; + int64_t new_max_size; int64_t conf_journal_sz(g_conf->osd_journal_size); conf_journal_sz <<= 20; @@ -298,34 +301,46 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, << newsize << " bytes: " << cpp_strerror(err) << dendl; return -err; } +#ifdef HAVE_POSIX_FALLOCATE ret = ::posix_fallocate(fd, 0, newsize); if (ret) { derr << "FileJournal::_open_file : unable to preallocation journal to " - << newsize << " bytes: " << cpp_strerror(ret) << dendl; - return -ret; + << newsize << " bytes: " << cpp_strerror(ret) + << " using zero-fill fallback" << dendl; + zero_fill = true; } - max_size = newsize; +#else + derr << "FileJournal::_open_file : posix_fallocate unavailable. " + << "zero filling " << newsize << " bytes" << dendl; + zero_fill = true; +#endif + new_max_size = newsize; } else { - max_size = oldsize; + new_max_size = oldsize; } - block_size = MAX(blksize, (blksize_t)CEPH_PAGE_SIZE); - if (create && g_conf->journal_zero_on_create) { + /* + * Journal zeroing is used as a fallback for posix_fallocate failure or + * if posix_fallocate isn't supported. A simple performance optimization + * here would be to only write a zero byte to the end of each block if full + * zeroing wasn't requested. + */ + if (zero_fill || (create && g_conf->journal_zero_on_create)) { derr << "FileJournal::_open_file : zeroing journal" << dendl; uint64_t write_size = 1 << 20; char *buf = new char[write_size]; memset(static_cast(buf), 0, write_size); uint64_t i = 0; - for (; (i + write_size) <= (unsigned)max_size; i += write_size) { + for (; (i + write_size) <= (unsigned)new_max_size; i += write_size) { ret = ::pwrite(fd, static_cast(buf), write_size, i); if (ret < 0) { delete [] buf; return -errno; } } - if (i < (unsigned)max_size) { - ret = ::pwrite(fd, static_cast(buf), max_size - i, i); + if (i < (unsigned)new_max_size) { + ret = ::pwrite(fd, static_cast(buf), new_max_size - i, i); if (ret < 0) { delete [] buf; return -errno; @@ -333,7 +348,9 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize, } delete [] buf; } - + + max_size = new_max_size; + block_size = MAX(blksize, (blksize_t)CEPH_PAGE_SIZE); dout(10) << "_open journal is not a block device, NOT checking disk " << "write cache on '" << fn << "'" << dendl; -- cgit v1.2.1