summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--configure.ac18
-rw-r--r--doc/radosgw/s3/php.rst7
-rw-r--r--doc/rbd/rados-rbd-cmds.rst7
-rw-r--r--src/.gitignore1
-rw-r--r--src/Makefile.am5
-rw-r--r--src/common/config_opts.h1
-rw-r--r--src/crush/CrushWrapper.cc37
-rw-r--r--src/crush/CrushWrapper.h8
-rw-r--r--src/include/cephfs/libcephfs.h2
-rw-r--r--src/include/utime.h2
-rw-r--r--src/java/java/com/ceph/fs/CephMount.java51
-rw-r--r--src/java/native/libcephfs_jni.cc177
-rw-r--r--src/java/test/com/ceph/fs/CephMountTest.java83
-rw-r--r--src/java/test/com/ceph/fs/CephUnmountedTest.java16
-rw-r--r--src/mon/Elector.cc2
-rw-r--r--src/mon/Monitor.cc116
-rw-r--r--src/mon/Monitor.h8
-rw-r--r--src/mon/OSDMonitor.cc16
-rw-r--r--src/osd/OSDMap.cc62
-rw-r--r--src/osd/OSDMap.h6
-rw-r--r--src/osd/osd_types.cc2
-rwxr-xr-xsrc/ps-ceph.pl1
-rw-r--r--src/rbd.cc36
-rw-r--r--src/rbd_fuse/rbd-fuse.c752
-rw-r--r--src/rgw/rgw_rest_s3.cc2
-rw-r--r--src/test/cli/osdmaptool/clobber.t12
-rw-r--r--src/test/cli/osdmaptool/create-print.t6
27 files changed, 1353 insertions, 83 deletions
diff --git a/configure.ac b/configure.ac
index b67e5cd39c3..cb49a0b2fe1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,5 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
-m4_include(m4/acx_pthread.m4)
# Autoconf
AC_PREREQ(2.59)
@@ -12,8 +11,15 @@ AC_PREREQ(2.59)
AC_INIT([ceph], [0.56], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
+RPM_RELEASE=0
AC_SUBST(RPM_RELEASE)
-RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi`
+if test -d ".git" ; then
+ AC_CHECK_PROG(GIT_CHECK, git, yes)
+ if test x"$GIT_CHECK" = x"yes"; then
+ RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi`
+ fi
+fi
+AC_MSG_NOTICE([RPM_RELEASE='$RPM_RELEASE'])
AC_CONFIG_MACRO_DIR([m4])
@@ -235,6 +241,8 @@ AS_IF([test "x$with_fuse" != xno],
AC_DEFINE([HAVE_LIBFUSE], [1],
[Define if you have fuse])
HAVE_LIBFUSE=1
+ # look for fuse_getgroups and define HAVE_FUSE_GETGROUPS if found
+ AC_CHECK_FUNCS([fuse_getgroups])
],
[AC_MSG_FAILURE(
[no FUSE found (use --without-fuse to disable)])])])
@@ -385,7 +393,8 @@ AS_IF([test "x$with_libatomic_ops" != xno],
])])
AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
[],
- AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you don't have atomic_ops]))
+ [AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
+
AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
# newsyn? requires mpi.
@@ -411,9 +420,6 @@ AS_IF([test "x$with_system_leveldb" = xcheck],
[AC_CHECK_LIB([leveldb], [leveldb_open], [with_system_leveldb=yes], [], [-lsnappy -lpthread])])
AM_CONDITIONAL(WITH_SYSTEM_LEVELDB, [ test "$with_system_leveldb" = "yes" ])
-# look for fuse_getgroups and define FUSE_GETGROUPS if found
-AC_CHECK_FUNCS([fuse_getgroups])
-
# use system libs3?
AC_ARG_WITH([system-libs3],
[AS_HELP_STRING([--with-system-libs3], [use system libs3])],
diff --git a/doc/radosgw/s3/php.rst b/doc/radosgw/s3/php.rst
index fc3087e54d4..40542e072a2 100644
--- a/doc/radosgw/s3/php.rst
+++ b/doc/radosgw/s3/php.rst
@@ -21,7 +21,12 @@ This creates a connection so that you can interact with the server.
require_once 'AWSSDKforPHP/sdk.class.php';
// Instantiate the S3 class and point it at the desired host
- $Connection = new AmazonS3();
+ $Connection = new AmazonS3(array(
+ 'key' => AWS_KEY,
+ 'secret' => AWS_SECRET_KEY,
+ 'canonical_id' => AWS_CANONICAL_ID,
+ 'canonical_name' => AWS_CANONICAL_NAME,
+ ));
$Connection->set_hostname($HOST);
$Connection->allow_hostname_override(false);
diff --git a/doc/rbd/rados-rbd-cmds.rst b/doc/rbd/rados-rbd-cmds.rst
index 6e28a6ab713..eec58512edc 100644
--- a/doc/rbd/rados-rbd-cmds.rst
+++ b/doc/rbd/rados-rbd-cmds.rst
@@ -17,13 +17,13 @@ Before you can add a block device to a Ceph client, you must create an image for
it in the OSD cluster first. To create a block device image, execute the
following::
- rbd create {image-name} --size {megabytes} --dest-pool {pool-name}
+ rbd create {image-name} --size {megabytes} --pool {pool-name}
For example, to create a 1GB image named ``foo`` that stores information in a
pool named ``swimmingpool``, execute the following::
rbd create foo --size 1024
- rbd create bar --size 1024 --pool swimmingpool
+ rbd create bar --size 1024 --pool swimmingpool
.. note:: You must create a pool first before you can specify it as a
source. See `Storage Pools`_ for details.
@@ -99,8 +99,7 @@ For example::
rbd rm bar -p swimmingpool
-
.. _Storage Pools: ../../rados/operations/pools
-.. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/ \ No newline at end of file
+.. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/
diff --git a/src/.gitignore b/src/.gitignore
index 7548b5e47ae..d3cab1a4d1f 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -54,6 +54,7 @@
/fsconverter
/xattr_bench
/rest-bench
+/rbd-fuse
dev
mondata
mnt
diff --git a/src/Makefile.am b/src/Makefile.am
index 82d585c4507..c30c0c1a705 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -161,6 +161,11 @@ ceph_fuse_LDADD = -lfuse libclient.la $(LIBGLOBAL_LDA)
ceph_fuse_CXXFLAGS = ${AM_CXXFLAGS}
bin_PROGRAMS += ceph-fuse
+rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.c
+rbd_fuse_LDADD = -lfuse librados.la librbd.la $(LIBGLOBAL_LDA)
+rbd_fuse_CXXFLAGS = ${AM_CXXFLAGS}
+bin_PROGRAMS += rbd-fuse
+
endif
# tcmalloc?
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 59caca5a6a2..a778268d51a 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -127,6 +127,7 @@ OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds '
OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in'
OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in'
OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds
+OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out
OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down
OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out
OPTION(mon_lease, OPT_FLOAT, 5) // lease interval
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 3bae96c8689..45e4fb53de6 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -202,6 +202,23 @@ map<int, string> CrushWrapper::get_parent_hierarchy(int id)
return parent_hierarchy;
}
+/**
+ * Append the immediate children of a crush node to a caller-owned list.
+ *
+ * @param id node to inspect; a non-negative id is treated as a leaf
+ * @param children list that receives the child ids (appended, not cleared)
+ * @return number of ids appended, 0 for a leaf, or -ENOENT when no bucket
+ *         is found for id
+ */
+int CrushWrapper::get_children(int id, list<int> *children)
+{
+  // a leaf has nothing below it
+  if (id >= 0)
+    return 0;
+
+  // NOTE(review): assumes get_bucket() signals failure with a null
+  // pointer -- confirm against its definition
+  crush_bucket *bucket = get_bucket(id);
+  if (!bucket)
+    return -ENOENT;
+
+  for (unsigned pos = 0; pos < bucket->size; ++pos)
+    children->push_back(bucket->items[pos]);
+
+  return bucket->size;
+}
int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name,
@@ -426,24 +443,36 @@ pair<string,string> CrushWrapper::get_immediate_parent(int id)
{
pair <string, string> loc;
-
for (int bidx = 0; bidx < crush->max_buckets; bidx++) {
crush_bucket *b = crush->buckets[bidx];
if (b == 0)
continue;
for (unsigned i = 0; i < b->size; i++)
- if (b->items[i] == id){
+ if (b->items[i] == id) {
string parent_id = name_map[b->id];
string parent_bucket_type = type_map[b->type];
loc = make_pair(parent_bucket_type, parent_id);
}
}
-
return loc;
}
-
+/**
+ * Find the bucket that directly contains an item.
+ *
+ * Buckets are scanned in index order and the first one listing id among
+ * its items wins.
+ *
+ * @param id item (device or bucket) to look for
+ * @param parent set to the containing bucket's id on success; untouched
+ *        otherwise
+ * @return 0 on success, -ENOENT if no bucket contains id
+ */
+int CrushWrapper::get_immediate_parent_id(int id, int *parent)
+{
+  for (int slot = 0; slot < crush->max_buckets; ++slot) {
+    crush_bucket *bucket = crush->buckets[slot];
+    if (!bucket)
+      continue;   // unused slot
+
+    // advance until we hit id or run off the end of this bucket
+    unsigned pos = 0;
+    while (pos < bucket->size && bucket->items[pos] != id)
+      ++pos;
+
+    if (pos < bucket->size) {
+      *parent = bucket->id;
+      return 0;
+    }
+  }
+  return -ENOENT;
+}
void CrushWrapper::reweight(CephContext *cct)
{
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 56bcb598ff3..7def6e4ab34 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -284,6 +284,7 @@ public:
* returns the (type, name) of the parent bucket of id
*/
pair<string,string> get_immediate_parent(int id);
+ int get_immediate_parent_id(int id, int *parent);
/**
* get the fully qualified location of a device by successively finding
@@ -302,6 +303,13 @@ public:
*/
map<int, string> get_parent_hierarchy(int id);
+ /**
+ * enumerate immediate children of given node
+ *
+ * @param id parent bucket or device id
+ * @return number of items, or error
+ */
+ int get_children(int id, list<int> *children);
/**
* insert an item into the map at a specific position
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
index 63e9233d9da..7b04cce4270 100644
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -851,7 +851,7 @@ int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
* @param fh the open file descriptor referring to the file
* @param buf buffer to store the name in
* @param buflen size of the buffer
- * @returns length in bytes of the pool name
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
*/
int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
diff --git a/src/include/utime.h b/src/include/utime.h
index 526dec568ae..f433fff4467 100644
--- a/src/include/utime.h
+++ b/src/include/utime.h
@@ -136,7 +136,7 @@ public:
}
void sleep() {
- struct timespec ts = { tv.tv_sec, tv.tv_nsec };
+ // Sleep for the duration held in tv. The casts silence narrowing
+ // diagnostics in the brace initializer; time_t (not the glibc-internal
+ // __time_t) is the portable type of timespec::tv_sec.
+ struct timespec ts = { (time_t)tv.tv_sec, (long)tv.tv_nsec };
nanosleep(&ts, &ts);
}
diff --git a/src/java/java/com/ceph/fs/CephMount.java b/src/java/java/com/ceph/fs/CephMount.java
index 10036f6c768..6051cf635d0 100644
--- a/src/java/java/com/ceph/fs/CephMount.java
+++ b/src/java/java/com/ceph/fs/CephMount.java
@@ -432,6 +432,23 @@ public class CephMount {
* @param path Path of file to stat.
* @param stat CephStat structure to hold file status.
*/
+ public void stat(String path, CephStat stat) throws FileNotFoundException, CephNotDirectoryException {
+ // take the read lock for the duration of the native call
+ rlock.lock();
+ try {
+ // the integer result of the native call is not inspected here
+ native_ceph_stat(instance_ptr, path, stat);
+ } finally {
+ // always release the lock, even when the native call throws
+ rlock.unlock();
+ }
+ }
+
+ private static native int native_ceph_stat(long mountp, String path, CephStat stat);
+
+ /**
+ * Get file status, without following symlinks.
+ *
+ * @param path Path of file to stat.
+ * @param stat CephStat structure to hold file status.
+ */
public void lstat(String path, CephStat stat) throws FileNotFoundException, CephNotDirectoryException {
rlock.lock();
try {
@@ -479,6 +496,23 @@ public class CephMount {
private static native int native_ceph_chmod(long mountp, String path, int mode);
/**
+ * Change file mode of an open file.
+ *
+ * @param fd The open file descriptor to change the mode bits on.
+ * @param mode New mode bits.
+ */
+ public void fchmod(int fd, int mode) {
+ // take the read lock for the duration of the native call
+ rlock.lock();
+ try {
+ // the integer result of the native call is not inspected here
+ native_ceph_fchmod(instance_ptr, fd, mode);
+ } finally {
+ // always release the lock, even when the native call throws
+ rlock.unlock();
+ }
+ }
+
+ private static native int native_ceph_fchmod(long mountp, int fd, int mode);
+
+ /**
* Truncate a file to a specified length.
*
* @param path Path of the file.
@@ -852,6 +886,23 @@ public class CephMount {
private static native int native_ceph_get_file_stripe_unit(long mountp, int fd);
/**
+ * Get the name of the pool a file is stored in.
+ *
+ * @param fd An open file descriptor.
+ * @return The pool name.
+ */
+ public String get_file_pool_name(int fd) {
+ // take the read lock for the duration of the native call
+ rlock.lock();
+ try {
+ // hand back whatever string the native layer produced
+ return native_ceph_get_file_pool_name(instance_ptr, fd);
+ } finally {
+ // always release the lock, even when the native call throws
+ rlock.unlock();
+ }
+ }
+
+ private static native String native_ceph_get_file_pool_name(long mountp, int fd);
+
+ /**
* Get the replication of a file.
*
* @param fd The file descriptor.
diff --git a/src/java/native/libcephfs_jni.cc b/src/java/native/libcephfs_jni.cc
index 2fef171a976..d5189fbf8da 100644
--- a/src/java/native/libcephfs_jni.cc
+++ b/src/java/native/libcephfs_jni.cc
@@ -173,7 +173,7 @@ static void cephThrowNullArg(JNIEnv *env, const char *msg)
static void cephThrowOutOfMemory(JNIEnv *env, const char *msg)
{
- THROW(env, "java/lang/OutOfMemoryException", msg);
+ THROW(env, "java/lang/OutOfMemoryError", msg);
}
static void cephThrowInternal(JNIEnv *env, const char *msg)
@@ -1189,6 +1189,35 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1symlink
return ret;
}
+/*
+ * Copy the fields of a struct stat into a Java CephStat object.
+ *
+ * Modification and access times are stored in milliseconds
+ * (seconds * 1000 plus nanoseconds / 1000000).
+ */
+static void fill_cephstat(JNIEnv *env, jobject j_cephstat, struct stat *st)
+{
+  /* convert both timestamps to milliseconds up front */
+  const long long mtime_ms =
+    (long long)st->st_mtim.tv_sec * 1000 + st->st_mtim.tv_nsec / 1000000;
+  const long long atime_ms =
+    (long long)st->st_atim.tv_sec * 1000 + st->st_atim.tv_nsec / 1000000;
+
+  /* ownership, mode and size information */
+  env->SetIntField(j_cephstat, cephstat_mode_fid, st->st_mode);
+  env->SetIntField(j_cephstat, cephstat_uid_fid, st->st_uid);
+  env->SetIntField(j_cephstat, cephstat_gid_fid, st->st_gid);
+  env->SetLongField(j_cephstat, cephstat_size_fid, st->st_size);
+  env->SetLongField(j_cephstat, cephstat_blksize_fid, st->st_blksize);
+  env->SetLongField(j_cephstat, cephstat_blocks_fid, st->st_blocks);
+
+  /* timestamps */
+  env->SetLongField(j_cephstat, cephstat_m_time_fid, mtime_ms);
+  env->SetLongField(j_cephstat, cephstat_a_time_fid, atime_ms);
+
+  /* file type flags derived from the mode bits */
+  const jboolean is_file = S_ISREG(st->st_mode) ? JNI_TRUE : JNI_FALSE;
+  const jboolean is_dir = S_ISDIR(st->st_mode) ? JNI_TRUE : JNI_FALSE;
+  const jboolean is_link = S_ISLNK(st->st_mode) ? JNI_TRUE : JNI_FALSE;
+  env->SetBooleanField(j_cephstat, cephstat_is_file_fid, is_file);
+  env->SetBooleanField(j_cephstat, cephstat_is_directory_fid, is_dir);
+  env->SetBooleanField(j_cephstat, cephstat_is_symlink_fid, is_link);
+}
+
/*
* Class: com_ceph_fs_CephMount
* Method: native_ceph_lstat
@@ -1200,7 +1229,6 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1lstat
struct ceph_mount_info *cmount = get_ceph_mount(j_mntp);
CephContext *cct = ceph_get_mount_context(cmount);
const char *c_path;
- long long time;
struct stat st;
int ret;
@@ -1227,31 +1255,49 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1lstat
return ret;
}
- env->SetIntField(j_cephstat, cephstat_mode_fid, st.st_mode);
- env->SetIntField(j_cephstat, cephstat_uid_fid, st.st_uid);
- env->SetIntField(j_cephstat, cephstat_gid_fid, st.st_gid);
- env->SetLongField(j_cephstat, cephstat_size_fid, st.st_size);
- env->SetLongField(j_cephstat, cephstat_blksize_fid, st.st_blksize);
- env->SetLongField(j_cephstat, cephstat_blocks_fid, st.st_blocks);
+ fill_cephstat(env, j_cephstat, &st);
- time = st.st_mtim.tv_sec;
- time *= 1000;
- time += st.st_mtim.tv_nsec / 1000000;
- env->SetLongField(j_cephstat, cephstat_m_time_fid, time);
+ return ret;
+}
- time = st.st_atim.tv_sec;
- time *= 1000;
- time += st.st_atim.tv_nsec / 1000000;
- env->SetLongField(j_cephstat, cephstat_a_time_fid, time);
+/*
+ * Class: com_ceph_fs_CephMount
+ * Method: native_ceph_stat
+ * Signature: (JLjava/lang/String;Lcom/ceph/fs/CephStat;)I
+ *
+ * Calls ceph_stat() on the given path and, on success, copies the result
+ * into the supplied CephStat object. Returns -1 after throwing for a null
+ * argument, an unmounted client or a failed string pin; otherwise returns
+ * the ceph_stat() result (errors are also reported via handle_error()).
+ */
+JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1stat
+ (JNIEnv *env, jclass clz, jlong j_mntp, jstring j_path, jobject j_cephstat)
+{
+ struct ceph_mount_info *cmount = get_ceph_mount(j_mntp);
+ CephContext *cct = ceph_get_mount_context(cmount);
+ const char *c_path;
+ struct stat st;
+ int ret;
- env->SetBooleanField(j_cephstat, cephstat_is_file_fid,
- S_ISREG(st.st_mode) ? JNI_TRUE : JNI_FALSE);
+ CHECK_ARG_NULL(j_path, "@path is null", -1);
+ CHECK_ARG_NULL(j_cephstat, "@stat is null", -1);
+ CHECK_MOUNTED(cmount, -1);
- env->SetBooleanField(j_cephstat, cephstat_is_directory_fid,
- S_ISDIR(st.st_mode) ? JNI_TRUE : JNI_FALSE);
+ c_path = env->GetStringUTFChars(j_path, NULL);
+ if (!c_path) {
+ cephThrowInternal(env, "Failed to pin memory");
+ return -1;
+ }
- env->SetBooleanField(j_cephstat, cephstat_is_symlink_fid,
- S_ISLNK(st.st_mode) ? JNI_TRUE : JNI_FALSE);
+ ldout(cct, 10) << "jni: stat: path " << c_path << dendl;
+
+ ret = ceph_stat(cmount, c_path, &st);
+
+ ldout(cct, 10) << "jni: stat exit ret " << ret << dendl;
+
+ /* the pinned path is no longer needed once the call has returned */
+ env->ReleaseStringUTFChars(j_path, c_path);
+
+ if (ret) {
+ handle_error(env, ret);
+ return ret;
+ }
+
+ fill_cephstat(env, j_cephstat, &st);
return ret;
}
@@ -1340,6 +1386,32 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1chmod
/*
* Class: com_ceph_fs_CephMount
+ * Method: native_ceph_fchmod
+ * Signature: (JII)I
+ */
+JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1fchmod
+ (JNIEnv *env, jclass clz, jlong j_mntp, jint j_fd, jint j_mode)
+{
+ struct ceph_mount_info *cmount = get_ceph_mount(j_mntp);
+ CephContext *cct = ceph_get_mount_context(cmount);
+ int ret;
+
+ /* bail out with -1 when the client is not mounted */
+ CHECK_MOUNTED(cmount, -1);
+
+ ldout(cct, 10) << "jni: fchmod: fd " << (int)j_fd << " mode " << (int)j_mode << dendl;
+
+ ret = ceph_fchmod(cmount, (int)j_fd, (int)j_mode);
+
+ ldout(cct, 10) << "jni: fchmod: exit ret " << ret << dendl;
+
+ /* any non-zero result is reported through handle_error() and also returned */
+ if (ret)
+ handle_error(env, ret);
+
+ return ret;
+}
+
+/*
+ * Class: com_ceph_fs_CephMount
* Method: native_ceph_truncate
* Signature: (JLjava/lang/String;J)I
*/
@@ -2385,6 +2457,67 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1get_1file_1repli
/*
* Class: com_ceph_fs_CephMount
+ * Method: native_ceph_get_file_pool_name
+ * Signature: (JI)Ljava/lang/String;
+ */
+JNIEXPORT jstring JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1get_1file_1pool_1name
+  (JNIEnv *env, jclass clz, jlong j_mntp, jint j_fd)
+{
+  /*
+   * Returns the name of the pool backing the open file as a Java string,
+   * or NULL after raising an exception (not mounted, allocation failure,
+   * or an error reported by ceph_get_file_pool_name()).
+   */
+  struct ceph_mount_info *cmount = get_ceph_mount(j_mntp);
+  CephContext *cct = ceph_get_mount_context(cmount);
+  jstring pool = NULL;
+  int ret, buflen = 0;
+  char *buf = NULL;
+
+  CHECK_MOUNTED(cmount, NULL);
+
+  ldout(cct, 10) << "jni: get_file_pool_name: fd " << (int)j_fd << dendl;
+
+  /*
+   * Query the length, allocate, then fetch. The loop repeats when the
+   * fetch reports -ERANGE, i.e. the name grew between the two calls.
+   */
+  for (;;) {
+    /* get pool name length (len==0) */
+    ret = ceph_get_file_pool_name(cmount, (int)j_fd, NULL, 0);
+    if (ret < 0)
+      break;
+
+    /* (re)allocate buffer; delete [] on a null pointer is a no-op */
+    delete [] buf;
+    buflen = ret;
+    buf = new (std::nothrow) char[buflen+1]; /* +1 for '\0' */
+    if (!buf) {
+      cephThrowOutOfMemory(env, "heap allocation failed");
+      goto out;
+    }
+    /* zero-fill so the result is always NUL terminated */
+    memset(buf, 0, (buflen+1)*sizeof(*buf));
+
+    /* handle zero-length pool name!? */
+    if (buflen == 0)
+      break;
+
+    /* fill buffer */
+    ret = ceph_get_file_pool_name(cmount, (int)j_fd, buf, buflen);
+    if (ret == -ERANGE) /* size changed! */
+      continue;
+    else
+      break;
+  }
+
+  ldout(cct, 10) << "jni: get_file_pool_name: ret " << ret << dendl;
+
+  if (ret < 0)
+    handle_error(env, ret);
+  else
+    pool = env->NewStringUTF(buf);
+
+out:
+  delete [] buf;
+
+  return pool;
+}
+
+/*
+ * Class: com_ceph_fs_CephMount
* Method: native_ceph_localize_reads
* Signature: (JZ)I
*/
diff --git a/src/java/test/com/ceph/fs/CephMountTest.java b/src/java/test/com/ceph/fs/CephMountTest.java
index 984c2cb7377..9d205121cc5 100644
--- a/src/java/test/com/ceph/fs/CephMountTest.java
+++ b/src/java/test/com/ceph/fs/CephMountTest.java
@@ -451,7 +451,9 @@ public class CephMountTest {
}
/*
- * test_stat covers lstat and fstat
+ * test_stat covers lstat and fstat and stat.
+ *
+ * TODO: create a test for lstat vs stat with symlink follow/nofollow.
*/
@Test
@@ -469,6 +471,10 @@ public class CephMountTest {
assertTrue(orig_st.blksize > 0);
assertTrue(orig_st.blocks > 0);
+ /* now try stat */
+ CephStat stat_st = new CephStat();
+ mount.stat(path, stat_st);
+
/* now try fstat */
CephStat other_st = new CephStat();
fd = mount.open(path, CephMount.O_RDWR, 0);
@@ -477,12 +483,40 @@ public class CephMountTest {
mount.unlink(path);
+ /* compare to fstat results */
assertTrue(orig_st.mode == other_st.mode);
assertTrue(orig_st.uid == other_st.uid);
assertTrue(orig_st.gid == other_st.gid);
assertTrue(orig_st.size == other_st.size);
assertTrue(orig_st.blksize == other_st.blksize);
assertTrue(orig_st.blocks == other_st.blocks);
+
+ /* compare to stat results */
+ assertTrue(orig_st.mode == stat_st.mode);
+ assertTrue(orig_st.uid == stat_st.uid);
+ assertTrue(orig_st.gid == stat_st.gid);
+ assertTrue(orig_st.size == stat_st.size);
+ assertTrue(orig_st.blksize == stat_st.blksize);
+ assertTrue(orig_st.blocks == stat_st.blocks);
+ }
+
+ /*
+ * stat
+ */
+
+ /* a null path must be rejected with a NullPointerException */
+ @Test(expected=NullPointerException.class)
+ public void test_stat_null_path() throws Exception {
+ mount.stat(null, new CephStat());
+ }
+
+ /* a null CephStat must be rejected with a NullPointerException */
+ @Test(expected=NullPointerException.class)
+ public void test_stat_null_stat() throws Exception {
+ mount.stat("/path", null);
+ }
+
+ /* despite the name, this checks a path that does not exist, not a null argument */
+ @Test(expected=FileNotFoundException.class)
+ public void test_stat_null_dne() throws Exception {
+ mount.stat("/path/does/not/exist", new CephStat());
}
@Test(expected=CephNotDirectoryException.class)
@@ -582,6 +616,36 @@ public class CephMountTest {
}
/*
+ * fchmod
+ */
+
+  /* fchmod on an open descriptor must be visible through a later lstat */
+  @Test
+  public void test_fchmod() throws Exception {
+    /* create a file and record its current mode */
+    String path = makePath();
+    int fd = createFile(path, 1);
+
+    CephStat before = new CephStat();
+    mount.lstat(path, before);
+
+    /* toggle the lowest mode bit */
+    int wanted = before.mode ^ 1;
+
+    mount.fchmod(fd, wanted);
+    mount.close(fd);
+
+    /* re-read the mode and compare */
+    CephStat after = new CephStat();
+    mount.lstat(path, after);
+    assertTrue(after.mode == wanted);
+
+    mount.unlink(path);
+  }
+
+ /*
* truncate
*/
@@ -867,4 +931,21 @@ public class CephMountTest {
assertTrue(poolid >= 0);
assertTrue(mount.get_pool_replication(poolid) > 0);
}
+
+  /* a freshly created file reports the pool it is stored in */
+  @Test
+  public void test_get_file_pool_name() throws Exception {
+    String path = makePath();
+    int fd = createFile(path, 1);
+    String poolName = mount.get_file_pool_name(fd);
+    mount.close(fd);
+
+    assertTrue(poolName != null);
+    /* assumes using default data pool "data" */
+    assertTrue(poolName.equals("data"));
+
+    mount.unlink(path);
+  }
+
+  /* -40 is not an open descriptor, so the lookup is expected to throw */
+  @Test(expected=IOException.class)
+  public void test_get_file_pool_name_ebadf() throws Exception {
+    mount.get_file_pool_name(-40);
+  }
}
diff --git a/src/java/test/com/ceph/fs/CephUnmountedTest.java b/src/java/test/com/ceph/fs/CephUnmountedTest.java
index ae4d41e1e98..eb95e69fb03 100644
--- a/src/java/test/com/ceph/fs/CephUnmountedTest.java
+++ b/src/java/test/com/ceph/fs/CephUnmountedTest.java
@@ -78,6 +78,12 @@ public class CephUnmountedTest {
}
@Test(expected=CephNotMountedException.class)
+ public void test_stat() throws Exception {
+ CephStat stat = new CephStat();
+ mount.stat("/a/path", stat);
+ }
+
+ @Test(expected=CephNotMountedException.class)
public void test_lstat() throws Exception {
CephStat stat = new CephStat();
mount.lstat("/a/path", stat);
@@ -145,4 +151,14 @@ public class CephUnmountedTest {
public void test_get_pool_replication() throws Exception {
mount.get_pool_replication(1);
}
+
+ @Test(expected=CephNotMountedException.class)
+ public void test_fchmod() throws Exception {
+ mount.fchmod(1, 0);
+ }
+
+ @Test(expected=CephNotMountedException.class)
+ public void test_chmod() throws Exception {
+ mount.chmod("/foo", 0);
+ }
}
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index e2ffa6bd571..199eaeae538 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -271,6 +271,8 @@ void Elector::handle_victory(MMonElection *m)
assert(from < mon->rank);
assert(m->epoch % 2 == 0);
+ leader_acked = -1;
+
// i should have seen this election if i'm getting the victory.
if (m->epoch != epoch + 1) {
dout(5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl;
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 143ee65ed97..699db8968f1 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -736,7 +736,7 @@ void Monitor::reset()
{
dout(10) << "reset" << dendl;
- timecheck_cleanup();
+ timecheck_finish();
leader_since = utime_t();
if (!quorum.empty()) {
@@ -1189,7 +1189,7 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features)
finish_election();
if (monmap->size() > 1)
- timecheck();
+ timecheck_start();
}
void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features)
@@ -1213,6 +1213,7 @@ void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features
void Monitor::finish_election()
{
+ timecheck_finish();
exited_quorum = utime_t();
finish_contexts(g_ceph_context, waitfor_quorum);
finish_contexts(g_ceph_context, maybe_wait_for_quorum);
@@ -2240,18 +2241,98 @@ bool Monitor::_ms_dispatch(Message *m)
return ret;
}
+// Drop any existing timecheck state, then begin a fresh round.
+void Monitor::timecheck_start()
+{
+ dout(10) << __func__ << dendl;
+ timecheck_cleanup();
+ timecheck_start_round();
+}
+
+// Stop timechecking: logs the call and delegates to timecheck_cleanup().
+void Monitor::timecheck_finish()
+{
+ dout(10) << __func__ << dendl;
+ timecheck_cleanup();
+}
+
+/**
+ * Start a new timecheck round, or let the one in flight continue.
+ *
+ * An odd timecheck_round means a round is in progress. Such a round is
+ * left alone while it is younger than 3 * mon_timecheck_interval; once it
+ * has been running longer than that it is cancelled and a new round is
+ * started. In every case the next C_TimeCheck event is scheduled before
+ * returning. Must only be called on the leader.
+ */
+void Monitor::timecheck_start_round()
+{
+  dout(10) << __func__ << " curr " << timecheck_round << dendl;
+  assert(is_leader());
+
+  if (monmap->size() == 1) {
+    assert(0 == "We are alone; this shouldn't have been scheduled!");
+    return;
+  }
+
+  if (timecheck_round % 2) {
+    dout(10) << __func__ << " there's a timecheck going on" << dendl;
+    utime_t curr_time = ceph_clock_now(g_ceph_context);
+    double max = g_conf->mon_timecheck_interval*3;
+    // still within the allowed window: give the peers more time to ack
+    if (curr_time - timecheck_round_start < max) {
+      dout(10) << __func__ << " keep current round going" << dendl;
+      goto out;
+    } else {
+      // the round has overstayed its welcome; abandon it
+      dout(10) << __func__
+               << " finish current timecheck and start new" << dendl;
+      timecheck_cancel_round();
+    }
+  }
+
+  // an even round number means no round is in progress
+  assert(timecheck_round % 2 == 0);
+  timecheck_acks = 0;
+  timecheck_round ++;
+  timecheck_round_start = ceph_clock_now(g_ceph_context);
+  dout(10) << __func__ << " new " << timecheck_round << dendl;
+
+  timecheck();
+out:
+  dout(10) << __func__ << " setting up next event" << dendl;
+  timecheck_event = new C_TimeCheck(this);
+  timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event);
+}
+
+// Close the timecheck round in progress.
+//
+// The round counter must be odd on entry and is bumped to the next (even)
+// value; the round start time is reset. With success == true every peer
+// must already have acked and the results are reported. Otherwise the
+// peers that never answered are logged and forgotten.
+void Monitor::timecheck_finish_round(bool success)
+{
+ dout(10) << __func__ << " curr " << timecheck_round << dendl;
+ assert(timecheck_round % 2);
+ timecheck_round ++;
+ timecheck_round_start = utime_t();
+
+ if (success) {
+ assert(timecheck_waiting.size() == 0);
+ assert(timecheck_acks == quorum.size());
+ timecheck_report();
+ return;
+ }
+
+ // cancelled round: list everyone we were still waiting on in one log entry
+ dout(10) << __func__ << " " << timecheck_waiting.size()
+ << " peers still waiting:";
+ for (map<entity_inst_t,utime_t>::iterator p = timecheck_waiting.begin();
+ p != timecheck_waiting.end(); ++p) {
+ *_dout << " " << p->first.name;
+ }
+ *_dout << dendl;
+ timecheck_waiting.clear();
+
+ dout(10) << __func__ << " finished to " << timecheck_round << dendl;
+}
+
+// Abandon the round in progress: finishes it with success == false.
+void Monitor::timecheck_cancel_round()
+{
+ timecheck_finish_round(false);
+}
+
void Monitor::timecheck_cleanup()
{
timecheck_round = 0;
timecheck_acks = 0;
+ timecheck_round_start = utime_t();
if (timecheck_event) {
timer.cancel_event(timecheck_event);
timecheck_event = NULL;
}
-
- if (timecheck_waiting.size() > 0)
- timecheck_waiting.clear();
+ timecheck_waiting.clear();
timecheck_skews.clear();
timecheck_latencies.clear();
}
@@ -2300,20 +2381,12 @@ void Monitor::timecheck()
{
dout(10) << __func__ << dendl;
assert(is_leader());
-
if (monmap->size() == 1) {
- assert(0 == "We are alone; this shouldn't have been scheduled!");
+ assert(0 == "We are alone; we shouldn't have gotten here!");
return;
}
+ assert(timecheck_round % 2 != 0);
- if ((timecheck_round % 2) != 0) {
- dout(15) << __func__
- << " timecheck still in progress; laggy monitors maybe?"
- << dendl;
- goto out;
- }
-
- timecheck_round++;
timecheck_acks = 1; // we ack ourselves
dout(10) << __func__ << " start timecheck epoch " << get_epoch()
@@ -2336,12 +2409,6 @@ void Monitor::timecheck()
dout(10) << __func__ << " send " << *m << " to " << inst << dendl;
messenger->send_message(m, inst);
}
-
-out:
- dout(10) << __func__ << " setting up next event and timeout" << dendl;
- timecheck_event = new C_TimeCheck(this);
-
- timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event);
}
health_status_t Monitor::timecheck_status(ostringstream &ss,
@@ -2394,9 +2461,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m)
dout(1) << __func__ << " our clock was readjusted --"
<< " bump round and drop current check"
<< dendl;
- timecheck_round++;
- timecheck_acks = 0;
- timecheck_waiting.clear();
+ timecheck_cancel_round();
return;
}
@@ -2481,8 +2546,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m)
assert(timecheck_skews.size() == timecheck_acks);
assert(timecheck_waiting.size() == 0);
// everyone has acked, so bump the round to finish it.
- timecheck_round++;
- timecheck_report();
+ timecheck_finish_round();
}
}
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 9716e351348..c7704bb16da 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -238,6 +238,7 @@ private:
// finished.
version_t timecheck_round;
unsigned int timecheck_acks;
+ utime_t timecheck_round_start;
/**
* Time Check event.
*/
@@ -247,10 +248,15 @@ private:
Monitor *mon;
C_TimeCheck(Monitor *m) : mon(m) { }
void finish(int r) {
- mon->timecheck();
+ mon->timecheck_start_round();
}
};
+ void timecheck_start();
+ void timecheck_finish();
+ void timecheck_start_round();
+ void timecheck_finish_round(bool success = true);
+ void timecheck_cancel_round();
void timecheck_cleanup();
void timecheck_report();
void timecheck();
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 5786713e043..3d11cfffc0f 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1457,6 +1457,8 @@ void OSDMonitor::tick()
* ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us.
*/
if (can_mark_out(-1)) {
+ set<int> down_cache; // quick cache of down subtrees
+
map<int,utime_t>::iterator i = down_pending_out.begin();
while (i != down_pending_out.end()) {
int o = i->first;
@@ -1483,6 +1485,20 @@ void OSDMonitor::tick()
grace += my_grace;
}
+ // is this an entire large subtree down?
+ if (g_conf->mon_osd_down_out_subtree_limit.length()) {
+ int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit.c_str());
+ if (type > 0) {
+ if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
+ dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
+ << " subtree for osd." << o << " is down; resetting timer" << dendl;
+ // reset timer, too.
+ down_pending_out[o] = now;
+ continue;
+ }
+ }
+ }
+
if (g_conf->mon_osd_down_out_interval > 0 &&
down.sec() >= grace) {
dout(10) << "tick marking osd." << o << " OUT after " << down
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 439ff06505a..c7d044ac6fd 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -172,6 +172,68 @@ int OSDMap::Incremental::identify_osd(uuid_d u) const
return -1;
}
+/**
+ * Report whether every osd beneath a crush node is down.
+ *
+ * @param id osd id (>= 0) or bucket id (< 0)
+ * @param down_cache optional set of bucket ids already proven down; it is
+ *        consulted first and extended when a bucket turns out to be down
+ * @return true if id itself (for an osd) or all of its descendants are down
+ */
+bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
+{
+  // an osd is its own subtree
+  if (id >= 0)
+    return is_down(id);
+
+  // bucket already proven down earlier?
+  if (down_cache && down_cache->count(id))
+    return true;
+
+  // the result of get_children() is not checked, so a bucket that yields
+  // no children is reported as down
+  list<int> kids;
+  crush->get_children(id, &kids);
+
+  list<int>::iterator it = kids.begin();
+  while (it != kids.end()) {
+    if (!subtree_is_down(*it, down_cache))
+      return false;
+    ++it;
+  }
+
+  // remember the positive result for later lookups
+  if (down_cache)
+    down_cache->insert(id);
+  return true;
+}
+
+/**
+ * Decide whether the enclosing subtree of at least a given crush type is
+ * entirely down.
+ *
+ * Starting at id, walk up through the immediate parents. Each node
+ * visited must be completely down; the walk succeeds as soon as a node
+ * whose type is >= subtree_type is reached.
+ *
+ * @param cct context used for logging
+ * @param id osd or bucket id to start from
+ * @param subtree_type crush type id that is "big enough"
+ * @param down_cache optional cache of buckets known to be down, shared
+ *        across calls; a call-local cache is used when NULL
+ * @return true if a fully-down ancestor (or id itself) of type >=
+ *         subtree_type exists; false if any node on the way up is not
+ *         fully down or the hierarchy ends first
+ */
+bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+  // use a stack-local down_cache if we didn't get one from the
+  // caller.  then at least this particular call will avoid duplicated
+  // work.
+  set<int> local_down_cache;
+  if (!down_cache) {
+    down_cache = &local_down_cache;
+  }
+
+  if (!subtree_is_down(id, down_cache)) {
+    ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
+    return false;
+  }
+
+  int current = id;
+  while (true) {
+    // invariant: current subtree is known to be down.
+    int type;
+    if (current >= 0) {
+      type = 0;
+    } else {
+      type = crush->get_bucket_type(current);
+    }
+    assert(type >= 0);
+
+    // is this a big enough subtree to be done?
+    if (type >= subtree_type) {
+      ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
+      return true;
+    }
+
+    int r = crush->get_immediate_parent_id(current, &current);
+    if (r < 0) {
+      return false;
+    }
+
+    // re-establish the invariant for the parent: without this check a
+    // single down osd would make its whole rack look down.
+    if (!subtree_is_down(current, down_cache)) {
+      ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
+      return false;
+    }
+  }
+}
+
void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
{
__u16 v = 5;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 5105fc7ab0e..f3f84f0b470 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -316,6 +316,12 @@ private:
bool is_in(int osd) const {
return exists(osd) && !is_out(osd);
}
+
+ /**
+ * check if an entire crush subtree is down
+ */
+ bool subtree_is_down(int id, set<int> *down_cache) const;
+ bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
int identify_osd(const entity_addr_t& addr) const;
int identify_osd(const uuid_d& u) const;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 008462af426..55e420d6e74 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -509,6 +509,7 @@ void pg_pool_t::dump(Formatter *f) const
f->dump_unsigned("flags", get_flags());
f->dump_int("type", get_type());
f->dump_int("size", get_size());
+ f->dump_int("min_size", get_min_size());
f->dump_int("crush_ruleset", get_crush_ruleset());
f->dump_int("object_hash", get_object_hash());
f->dump_int("pg_num", get_pg_num());
@@ -829,6 +830,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
{
out << p.get_type_name()
<< " size " << p.get_size()
+ << " min_size " << p.get_min_size()
<< " crush_ruleset " << p.get_crush_ruleset()
<< " object_hash " << p.get_object_hash_name()
<< " pg_num " << p.get_pg_num()
diff --git a/src/ps-ceph.pl b/src/ps-ceph.pl
index dc236296494..03fc6061373 100755
--- a/src/ps-ceph.pl
+++ b/src/ps-ceph.pl
@@ -18,6 +18,7 @@ sub is_ceph_proc {
return 1 if $cmdline =~ /\bceph\b/;
return 1 if $cmdline =~ /\bceph-fuse\b/;
+ return 1 if $cmdline =~ /\brbd-fuse\b/;
return 1 if $cmdline =~ /\bceph-mds\b/;
return 1 if $cmdline =~ /\bceph-mon\b/;
return 1 if $cmdline =~ /\bceph-osd\b/;
diff --git a/src/rbd.cc b/src/rbd.cc
index 833188ae33c..dd56bc9309e 100644
--- a/src/rbd.cc
+++ b/src/rbd.cc
@@ -1428,8 +1428,16 @@ static int do_kernel_add(const char *poolname, const char *imgname,
// modprobe the rbd module if /sys/bus/rbd doesn't exist
struct stat sb;
- if ((stat("/sys/bus/rbd", &sb) < 0) || (!S_ISDIR(sb.st_mode)))
- system("/sbin/modprobe rbd");
+ if ((stat("/sys/bus/rbd", &sb) < 0) || (!S_ISDIR(sb.st_mode))) {
+ r = system("/sbin/modprobe rbd");
+ if (r) {
+ if (r < 0)
+ cerr << "rbd: error executing modprobe as shell command!" << std::endl;
+ else
+ cerr << "rbd: modprobe rbd failed! (" << r << ")" <<std::endl;
+ return r;
+ }
+ }
// write to /sys/bus/rbd/add
int fd = open("/sys/bus/rbd/add", O_WRONLY);
@@ -1448,8 +1456,16 @@ static int do_kernel_add(const char *poolname, const char *imgname,
close(fd);
// let udevadm do its job before we return
- if (udevadm_settle)
- system("/sbin/udevadm settle");
+ if (udevadm_settle) {
+ r = system("/sbin/udevadm settle");
+ if (r) {
+ if (r < 0)
+ cerr << "rbd: error executing udevadm as shell command!" << std::endl;
+ else
+ cerr << "rbd: '/sbin/udevadm settle' failed! (" << r << ")" <<std::endl;
+ return r;
+ }
+ }
return r;
}
@@ -1664,8 +1680,16 @@ static int do_kernel_rm(const char *dev)
r = close(fd);
// let udevadm finish, if present
- if (udevadm_settle)
- system("/sbin/udevadm settle");
+ if (udevadm_settle){
+ r = system("/sbin/udevadm settle");
+ if (r) {
+ if (r < 0)
+ cerr << "rbd: error executing udevadm as shell command!" << std::endl;
+ else
+ cerr << "rbd: '/sbin/udevadm settle' failed! (" << r << ")" <<std::endl;
+ return r;
+ }
+ }
if (r < 0)
r = -errno;
diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c
new file mode 100644
index 00000000000..748976aabaf
--- /dev/null
+++ b/src/rbd_fuse/rbd-fuse.c
@@ -0,0 +1,752 @@
+/*
+ * rbd-fuse
+ */
+#define FUSE_USE_VERSION 26
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fuse.h>
+#include <pthread.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <inttypes.h>
+
+#include "include/rbd/librbd.h"
+
+static int gotrados = 0; /* set to 1 by rbdfs_init() once cluster and ioctx are ready */
+char *pool_name; /* copied from rbd_options.pool_name in rbdfs_init() */
+rados_t cluster; /* handle filled in by connect_to_cluster() */
+rados_ioctx_t ioctx; /* created on pool_name in rbdfs_init() */
+
+static pthread_mutex_t readdir_lock; /* held around enumerate_images(), iter_images() and in_opendir updates */
+
+struct rbd_stat {
+ u_char valid; /* set to 1 after rbd_stat() in open_rbd_image(), 0 on unlink */
+ rbd_image_info_t rbd_info; /* result of the last rbd_stat() */
+};
+
+struct rbd_options {
+ char *ceph_config; /* path handed to rados_conf_read_file() */
+ char *pool_name; /* pool handed to rados_ioctx_create() */
+};
+
+struct rbd_image {
+ char *image_name; /* one name out of the rbd_list() result */
+ struct rbd_image *next; /* singly linked; NULL terminates */
+};
+struct rbd_image *rbd_images; /* list rebuilt by enumerate_images() */
+
+struct rbd_openimage {
+ char *image_name; /* strdup()ed name; find_openrbd() matches on it */
+ rbd_image_t image; /* handle from rbd_open(); NULL marks a free slot */
+ struct rbd_stat rbd_stat; /* cached image info for this slot */
+};
+#define MAX_RBD_IMAGES 128
+struct rbd_openimage opentbl[MAX_RBD_IMAGES]; /* index doubles as the FUSE file handle (fi->fh) */
+
+struct rbd_options rbd_options = {"/etc/ceph/ceph.conf", "rbd"}; /* defaults are string literals */
+
+#define rbdsize(fd) opentbl[fd].rbd_stat.rbd_info.size
+#define rbdblksize(fd) opentbl[fd].rbd_stat.rbd_info.obj_size
+#define rbdblkcnt(fd) opentbl[fd].rbd_stat.rbd_info.num_objs
+
+/* parameters passed to rbd_create2() by rbdfs_create(); settable via xattrs on "/" */
+uint64_t imagesize = 1024ULL * 1024 * 1024;
+uint64_t imageorder = 22ULL;
+uint64_t imagefeatures = 1ULL;
+
+// Minimize calls to rbd_list: marks bracketing of opendir/<ops>/releasedir
+int in_opendir; /* incremented in rbdfs_opendir(), decremented in rbdfs_releasedir() */
+
+/* prototypes */
+int connect_to_cluster(rados_t *pcluster);
+void enumerate_images(struct rbd_image **head);
+int open_rbd_image(const char *image_name);
+int find_openrbd(const char *path);
+
+void simple_err(const char *msg, int err);
+
+/*
+ * Rebuild the list at *head from the images currently in the pool.
+ *
+ * The previous list (nodes and the names they own) is released first.
+ * Each node gets its own strdup()ed name so the rbd_list() buffer can
+ * be freed before returning.  On any error *head is left empty or
+ * partially filled and a message is written to stderr.
+ */
+void
+enumerate_images(struct rbd_image **head)
+{
+	char *ibuf;
+	size_t ibuf_len;
+	struct rbd_image *im, *next;
+	char *ip;
+	int actual_len;
+
+	/* drop the old list together with the names it owns */
+	for (im = *head; im != NULL; im = next) {
+		next = im->next;
+		free(im->image_name);
+		free(im);
+	}
+	*head = NULL;
+
+	ibuf_len = 1024;
+	ibuf = malloc(ibuf_len);
+	if (ibuf == NULL) {
+		simple_err("enumerate_images: malloc", -ENOMEM);
+		return;
+	}
+	actual_len = rbd_list(ioctx, ibuf, &ibuf_len);
+	if (actual_len == -ERANGE) {
+		/* buffer too small: rbd_list stored the needed size in ibuf_len */
+		free(ibuf);
+		ibuf = malloc(ibuf_len);
+		if (ibuf == NULL) {
+			simple_err("enumerate_images: malloc", -ENOMEM);
+			return;
+		}
+		actual_len = rbd_list(ioctx, ibuf, &ibuf_len);
+	}
+	if (actual_len < 0) {
+		simple_err("rbd_list", actual_len);
+		free(ibuf);
+		return;
+	}
+
+	fprintf(stderr, "pool %s: ", pool_name);
+	/* check the bound before dereferencing so we never read past ibuf */
+	for (ip = ibuf; ip < &ibuf[actual_len] && *ip != '\0';
+	     ip += strlen(ip) + 1) {
+		char *name = strdup(ip);
+
+		im = malloc(sizeof(*im));
+		if (im == NULL || name == NULL) {
+			free(im);
+			free(name);
+			simple_err("enumerate_images: malloc", -ENOMEM);
+			break;
+		}
+		fprintf(stderr, "%s, ", ip);
+		im->image_name = name;
+		im->next = *head;
+		*head = im;
+	}
+	fprintf(stderr, "\n");
+	free(ibuf);
+}
+
+/*
+ * Return the opentbl[] index whose image_name equals path, or -1 when
+ * no slot carries that name.
+ */
+int
+find_openrbd(const char *path)
+{
+	int slot = 0;
+
+	while (slot < MAX_RBD_IMAGES) {
+		const char *name = opentbl[slot].image_name;
+
+		/* slots without a name are skipped */
+		if (name != NULL && strcmp(name, path) == 0)
+			return slot;
+		slot++;
+	}
+	return -1;
+}
+
+/*
+ * Return the opentbl[] index for image_name, opening the image if it
+ * is not open yet, and refresh the slot's cached rbd_stat() info.
+ *
+ * Returns -1 if image_name is NULL, is not in rbd_images, or the
+ * table is full; otherwise a negative errno from strdup/rbd_open/
+ * rbd_stat on failure.  Callers only test for "< 0".
+ */
+int
+open_rbd_image(const char *image_name)
+{
+	struct rbd_image *im;
+	struct rbd_openimage *rbd;
+	int fd;
+	int ret;
+
+	if (image_name == (char *)NULL)
+		return -1;
+
+	// relies on caller to keep rbd_images up to date
+	for (im = rbd_images; im != NULL; im = im->next) {
+		if (strcmp(im->image_name, image_name) == 0) {
+			break;
+		}
+	}
+	if (im == NULL)
+		return -1;
+
+	/* find in opentbl[] entry if already open */
+	if ((fd = find_openrbd(image_name)) != -1) {
+		rbd = &opentbl[fd];
+	} else {
+		// allocate an opentbl[] and open the image
+		for (fd = 0; fd < MAX_RBD_IMAGES; fd++) {
+			if (opentbl[fd].image == NULL)
+				break;
+		}
+		if (fd == MAX_RBD_IMAGES)
+			return -1;
+		rbd = &opentbl[fd];
+		rbd->image_name = strdup(image_name);
+		if (rbd->image_name == NULL)
+			return -ENOMEM;
+		ret = rbd_open(ioctx, rbd->image_name, &(rbd->image), NULL);
+		if (ret < 0) {
+			simple_err("open_rbd_image: can't open: ", ret);
+			/*
+			 * Release the slot again; otherwise find_openrbd()
+			 * would later match the name and hand out a slot
+			 * that has no open image behind it.
+			 */
+			free(rbd->image_name);
+			rbd->image_name = NULL;
+			rbd->image = NULL;
+			return ret;
+		}
+	}
+	ret = rbd_stat(rbd->image, &(rbd->rbd_stat.rbd_info),
+		       sizeof(rbd_image_info_t));
+	if (ret < 0) {
+		/* do not advertise stale or zeroed size information */
+		rbd->rbd_stat.valid = 0;
+		return ret;
+	}
+	rbd->rbd_stat.valid = 1;
+	return fd;
+}
+
+/*
+ * Invoke iter(cookie, name) for every entry of rbd_images while
+ * holding readdir_lock.
+ */
+static void
+iter_images(void *cookie,
+	    void (*iter)(void *cookie, const char *image))
+{
+	struct rbd_image *cur;
+
+	pthread_mutex_lock(&readdir_lock);
+	cur = rbd_images;
+	while (cur != NULL) {
+		iter(cookie, cur->image_name);
+		cur = cur->next;
+	}
+	pthread_mutex_unlock(&readdir_lock);
+}
+
+/* iter_images() callback: cookie points at an unsigned int tally. */
+static void count_images_cb(void *cookie, const char *image)
+{
+	unsigned int *tally = cookie;
+
+	*tally += 1;
+}
+
+/* Refresh rbd_images from the pool and return how many entries it has. */
+static int count_images(void)
+{
+	unsigned int total = 0;
+
+	/* re-list under the lock ... */
+	pthread_mutex_lock(&readdir_lock);
+	enumerate_images(&rbd_images);
+	pthread_mutex_unlock(&readdir_lock);
+
+	/* ... then count; iter_images() takes the lock itself */
+	iter_images(&total, count_images_cb);
+
+	return total;
+}
+
+/*
+ * FUSE getattr: "/" is reported as a directory, any other path as a
+ * regular file whose size fields come from the cached image info.
+ * Returns -ENXIO before init, -ENOENT for an empty path or an image
+ * that cannot be opened.
+ */
+static int rbdfs_getattr(const char *path, struct stat *stbuf)
+{
+	time_t stamp;
+	int slot;
+
+	if (!gotrados)
+		return -ENXIO;
+
+	if (*path == '\0')
+		return -ENOENT;
+
+	memset(stbuf, 0, sizeof(*stbuf));
+
+	if (strcmp(path, "/") != 0) {
+		/* an image: refresh the list unless an opendir is in progress */
+		if (!in_opendir) {
+			pthread_mutex_lock(&readdir_lock);
+			enumerate_images(&rbd_images);
+			pthread_mutex_unlock(&readdir_lock);
+		}
+		slot = open_rbd_image(path + 1);
+		if (slot < 0)
+			return -ENOENT;
+
+		stamp = time(NULL);
+		stbuf->st_mode = S_IFREG | 0666;
+		stbuf->st_nlink = 1;
+		stbuf->st_uid = getuid();
+		stbuf->st_gid = getgid();
+		stbuf->st_size = rbdsize(slot);
+		stbuf->st_blksize = rbdblksize(slot);
+		stbuf->st_blocks = rbdblkcnt(slot);
+		stbuf->st_atime = stamp;
+		stbuf->st_mtime = stamp;
+		stbuf->st_ctime = stamp;
+		return 0;
+	}
+
+	/* the single root directory */
+	stamp = time(NULL);
+	stbuf->st_mode = S_IFDIR + 0755;
+	stbuf->st_nlink = 2+count_images();
+	stbuf->st_uid = getuid();
+	stbuf->st_gid = getgid();
+	stbuf->st_size = 1024;
+	stbuf->st_blksize = 1024;
+	stbuf->st_blocks = 1;
+	stbuf->st_atime = stamp;
+	stbuf->st_mtime = stamp;
+	stbuf->st_ctime = stamp;
+
+	return 0;
+}
+
+
+/*
+ * FUSE open: refresh the image list, open the image named by path
+ * (without its leading '/') and store the opentbl[] index in fi->fh.
+ */
+static int rbdfs_open(const char *path, struct fuse_file_info *fi)
+{
+	int slot;
+
+	if (!gotrados)
+		return -ENXIO;
+	if (*path == '\0')
+		return -ENOENT;
+
+	pthread_mutex_lock(&readdir_lock);
+	enumerate_images(&rbd_images);
+	pthread_mutex_unlock(&readdir_lock);
+
+	slot = open_rbd_image(path + 1);
+	if (slot >= 0) {
+		fi->fh = slot;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+/*
+ * FUSE read: read up to size bytes at offset from the image in slot
+ * fi->fh, looping over short reads.
+ *
+ * Returns the number of bytes read.  If the very first rbd_read()
+ * fails, its negative error code is returned instead of 0 so the
+ * caller does not mistake an I/O error for end-of-file; an error
+ * after some data was read yields the partial count.
+ */
+static int rbdfs_read(const char *path, char *buf, size_t size,
+		      off_t offset, struct fuse_file_info *fi)
+{
+	size_t numread;
+	struct rbd_openimage *rbd;
+
+	if (!gotrados)
+		return -ENXIO;
+
+	rbd = &opentbl[fi->fh];
+	numread = 0;
+	while (size > 0) {
+		ssize_t ret;
+
+		ret = rbd_read(rbd->image, offset, size, buf);
+
+		if (ret < 0) {
+			if (numread == 0)
+				return (int)ret;
+			break;
+		}
+		if (ret == 0)
+			break;
+		buf += ret;
+		size -= ret;
+		offset += ret;
+		numread += ret;
+	}
+
+	return numread;
+}
+
+/*
+ * FUSE write: write size bytes at offset to the image in slot fi->fh,
+ * growing the image first when the write extends past its current
+ * size.
+ *
+ * Returns the number of bytes written.  A failure of the first
+ * rbd_write() is returned as its negative error code; a failure after
+ * partial progress yields the partial count.  A zero-length result
+ * from rbd_write() ends the loop instead of spinning.  Resize/stat
+ * failures are returned directly.
+ */
+static int rbdfs_write(const char *path, const char *buf, size_t size,
+		       off_t offset, struct fuse_file_info *fi)
+{
+	size_t numwritten;
+	struct rbd_openimage *rbd;
+
+	if (!gotrados)
+		return -ENXIO;
+
+	rbd = &opentbl[fi->fh];
+	numwritten = 0;
+	while (size > 0) {
+		ssize_t ret;
+		/* end of this write, in one unsigned type for compare and printf */
+		uint64_t end = (uint64_t)offset + size;
+
+		if (end > rbdsize(fi->fh)) {
+			int r;
+			fprintf(stderr, "rbdfs_write resizing %s to 0x%"PRIx64"\n",
+				path, end);
+			r = rbd_resize(rbd->image, end);
+			if (r < 0)
+				return r;
+
+			/* refresh the cached size used by rbdsize() */
+			r = rbd_stat(rbd->image, &(rbd->rbd_stat.rbd_info),
+				     sizeof(rbd_image_info_t));
+			if (r < 0)
+				return r;
+		}
+		ret = rbd_write(rbd->image, offset, size, buf);
+
+		if (ret < 0) {
+			if (numwritten == 0)
+				return (int)ret;
+			break;
+		}
+		if (ret == 0)
+			break;
+		buf += ret;
+		size -= ret;
+		offset += ret;
+		numwritten += ret;
+	}
+
+	return numwritten;
+}
+
+/*
+ * iter_images() callback for statfs: num points at two uint64_t
+ * values, [0] an entry counter and [1] a running sum of image sizes.
+ */
+static void rbdfs_statfs_image_cb(void *num, const char *image)
+{
+	uint64_t *totals = num;
+	int slot;
+
+	totals[0] += 1;
+
+	slot = open_rbd_image(image);
+	if (slot < 0)
+		return;		/* unopenable images add no size */
+	totals[1] += rbdsize(slot);
+}
+
+/*
+ * FUSE statfs: report the summed image sizes as the block count and
+ * the number of images plus one as the file count; nothing is
+ * reported as free.
+ */
+static int rbdfs_statfs(const char *path, struct statvfs *buf)
+{
+	/* [0] starts at 1, [1] accumulates bytes */
+	uint64_t totals[2] = { 1, 0 };
+
+	if (!gotrados)
+		return -ENXIO;
+
+	pthread_mutex_lock(&readdir_lock);
+	enumerate_images(&rbd_images);
+	pthread_mutex_unlock(&readdir_lock);
+	iter_images(totals, rbdfs_statfs_image_cb);
+
+#define RBDFS_BSIZE 4096
+	buf->f_bsize = RBDFS_BSIZE;
+	buf->f_frsize = RBDFS_BSIZE;
+	buf->f_blocks = totals[1] / RBDFS_BSIZE;
+	buf->f_files = totals[0];
+	buf->f_namemax = PATH_MAX;
+
+	/* read-only style accounting: no free space, no free inodes */
+	buf->f_bfree = 0;
+	buf->f_bavail = 0;
+	buf->f_ffree = 0;
+	buf->f_favail = 0;
+	buf->f_fsid = 0;
+	buf->f_flag = 0;
+
+	return 0;
+}
+
+/*
+ * FUSE fsync: flush the image in slot fi->fh.
+ *
+ * Returns -ENXIO before init, the negative error from rbd_flush() if
+ * the flush fails (so the caller of fsync learns about it), else 0.
+ */
+static int rbdfs_fsync(const char *path, int datasync,
+		       struct fuse_file_info *fi)
+{
+	int r;
+
+	if (!gotrados)
+		return -ENXIO;
+	r = rbd_flush(opentbl[fi->fh].image);
+	return (r < 0) ? r : 0;
+}
+
+/*
+ * FUSE opendir: bump the global in_opendir marker and refresh the
+ * image list, both under readdir_lock.  Always succeeds.
+ */
+static int rbdfs_opendir(const char *path, struct fuse_file_info *fi)
+{
+	pthread_mutex_lock(&readdir_lock);
+	/* there is only one directory, so one global marker is enough */
+	in_opendir += 1;
+	enumerate_images(&rbd_images);
+	pthread_mutex_unlock(&readdir_lock);
+
+	return 0;
+}
+
+/* Cookie handed through iter_images() during readdir. */
+struct rbdfs_readdir_info {
+	void *buf;		/* opaque buffer passed back to filler */
+	fuse_fill_dir_t filler;	/* FUSE callback that adds one entry */
+};
+
+/* iter_images() callback: add one image name to the directory listing. */
+static void rbdfs_readdir_cb(void *_info, const char *name)
+{
+	struct rbdfs_readdir_info *ctx = _info;
+	fuse_fill_dir_t add_entry = ctx->filler;
+
+	add_entry(ctx->buf, name, NULL, 0);
+}
+
+/*
+ * FUSE readdir: list ".", ".." and every entry of rbd_images for the
+ * root directory; any other path is -ENOENT.
+ */
+static int rbdfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+			 off_t offset, struct fuse_file_info *fi)
+{
+	struct rbdfs_readdir_info ctx;
+
+	ctx.buf = buf;
+	ctx.filler = filler;
+
+	if (!gotrados)
+		return -ENXIO;
+
+	/* diagnostic only; the listing still proceeds */
+	if (in_opendir == 0)
+		fprintf(stderr, "in readdir, but not inside opendir?\n");
+
+	if (strcmp(path, "/") == 0) {
+		filler(buf, ".", NULL, 0);
+		filler(buf, "..", NULL, 0);
+		iter_images(&ctx, rbdfs_readdir_cb);
+		return 0;
+	}
+	return -ENOENT;
+}
+/* FUSE releasedir: undo the in_opendir bump made by rbdfs_opendir(). */
+static int rbdfs_releasedir(const char *path, struct fuse_file_info *fi)
+{
+	pthread_mutex_lock(&readdir_lock);
+	in_opendir -= 1;
+	pthread_mutex_unlock(&readdir_lock);
+
+	return 0;
+}
+
+/*
+ * FUSE init callback: connect to the cluster, create the ioctx on the
+ * configured pool, request big writes where the FUSE version allows
+ * it, and set gotrados so the other operations stop returning ENXIO.
+ */
+void *
+rbdfs_init(struct fuse_conn_info *conn)
+{
+ int ret;
+
+ // init has no error return, so a failed cluster connection or
+ // ioctx creation ends the process via exit() (codes 90 and 91)
+
+ ret = connect_to_cluster(&cluster);
+ if (ret < 0)
+ exit(90);
+
+ pool_name = rbd_options.pool_name;
+ ret = rados_ioctx_create(cluster, pool_name, &ioctx);
+ if (ret < 0)
+ exit(91);
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
+ conn->want |= FUSE_CAP_BIG_WRITES;
+#endif
+ gotrados = 1;
+
+ // init's return value shows up in fuse_context.private_data,
+ // also to void (*destroy)(void *); useful?
+ return NULL;
+}
+
+// return -errno on error. fi->fh is not set until open time
+
+/*
+ * FUSE create: create an image named by path (minus the leading '/')
+ * using the current imagesize/imagefeatures/imageorder settings.
+ * Returns whatever rbd_create2() returns.
+ */
+int
+rbdfs_create(const char *path, mode_t mode, struct fuse_file_info *fi)
+{
+	/* rbd_create2() takes the order by pointer, so it needs an int copy */
+	int order = imageorder;
+
+	return rbd_create2(ioctx, path+1, imagesize, imagefeatures, &order);
+}
+
+/* FUSE utime: accepted and ignored; it is called on create. */
+int
+rbdfs_utime(const char *path, struct utimbuf *utime)
+{
+	(void)path;
+	(void)utime;
+
+	return 0;
+}
+
+/*
+ * FUSE unlink: close the image if it is open, release its opentbl[]
+ * slot, and remove the image from the pool.  Returns the result of
+ * rbd_remove().
+ */
+int
+rbdfs_unlink(const char *path)
+{
+	/*
+	 * opentbl[] names are stored without the leading '/' (they come
+	 * from open_rbd_image(path + 1)), so the lookup must skip it too.
+	 */
+	int fd = find_openrbd(path+1);
+	if (fd != -1) {
+		struct rbd_openimage *rbd = &opentbl[fd];
+		rbd_close(rbd->image);
+		rbd->image = 0;
+		free(rbd->image_name);
+		/* clear the pointer so find_openrbd() never touches freed memory */
+		rbd->image_name = NULL;
+		rbd->rbd_stat.valid = 0;
+	}
+	return rbd_remove(ioctx, path+1);
+}
+
+
+/*
+ * FUSE truncate: resize the image named by path to size bytes and
+ * refresh the cached image info.
+ *
+ * Returns -ENOENT if the image cannot be opened, a negative error
+ * from rbd_resize()/rbd_stat() on failure, else 0.
+ */
+int
+rbdfs_truncate(const char *path, off_t size)
+{
+	int fd;
+	int r;
+	struct rbd_openimage *rbd;
+
+	if ((fd = open_rbd_image(path+1)) < 0)
+		return -ENOENT;
+
+	rbd = &opentbl[fd];
+	/* PRIdMAX/PRIxMAX require (u)intmax_t arguments, so cast the off_t */
+	fprintf(stderr, "truncate %s to %"PRIdMAX" (0x%"PRIxMAX")\n", path,
+		(intmax_t)size, (uintmax_t)size);
+	r = rbd_resize(rbd->image, size);
+	if (r < 0)
+		return r;
+
+	r = rbd_stat(rbd->image, &(rbd->rbd_stat.rbd_info),
+		     sizeof(rbd_image_info_t));
+	if (r < 0)
+		return r;
+	return 0;
+}
+
+/**
+ * set an xattr on path, with name/value, length size.
+ * Presumably flags are from Linux, as in XATTR_CREATE or
+ * XATTR_REPLACE (both "set", but fail if exist vs fail if not exist).
+ *
+ * We accept xattrs only on the root node.
+ *
+ * All values converted with strtoull, so can be expressed in any base
+ */
+
+/* Table mapping xattr names to the creation parameter each one controls. */
+struct rbdfuse_attr {
+	char *attrname;		/* full xattr name */
+	uint64_t *attrvalp;	/* global variable read/written via the xattr */
+} attrs[] = {
+	{ "user.rbdfuse.imagesize", &imagesize },
+	{ "user.rbdfuse.imageorder", &imageorder },
+	{ "user.rbdfuse.imagefeatures", &imagefeatures },
+	{ NULL }		/* terminator: attrname == NULL */
+};
+
+/*
+ * FUSE setxattr: store a numeric value into the variable named by one
+ * of the attrs[] entries.  Only accepted on "/".
+ *
+ * value is a byte buffer of length size and is not guaranteed to be
+ * NUL-terminated, so it is copied into a bounded local buffer before
+ * being parsed with strtoull (base auto-detected).
+ *
+ * Returns 0 on success, -EINVAL for any other path, an unknown name,
+ * or a value too long to be a number.
+ */
+int
+rbdfs_setxattr(const char *path, const char *name, const char *value,
+	       size_t size, int flags)
+{
+	struct rbdfuse_attr *ap;
+	char numbuf[64];
+
+	if (strcmp(path, "/") != 0)
+		return -EINVAL;
+
+	for (ap = attrs; ap->attrname != NULL; ap++) {
+		if (strcmp(name, ap->attrname) != 0)
+			continue;
+		if (size >= sizeof(numbuf))
+			return -EINVAL;
+		memcpy(numbuf, value, size);
+		numbuf[size] = '\0';
+		*ap->attrvalp = strtoull(numbuf, NULL, 0);
+		fprintf(stderr, "rbd-fuse: %s set to 0x%"PRIx64"\n",
+			ap->attrname, *ap->attrvalp);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * FUSE getxattr: return the decimal text of the variable behind one
+ * of the attrs[] names.
+ *
+ * With size == 0 (or no buffer) only the required length is
+ * returned.  With a buffer that is too small -ERANGE is returned.
+ * Otherwise exactly the value's bytes are copied (no terminating NUL,
+ * so a buffer of exactly the reported length is never overrun) and
+ * the length is returned.  Unknown names yield 0.
+ */
+int
+rbdfs_getxattr(const char *path, const char *name, char *value,
+	       size_t size)
+{
+	struct rbdfuse_attr *ap;
+	char buf[128];
+	int len;
+	// allow gets on other files; ls likes to ask for things like
+	// security.*
+
+	for (ap = attrs; ap->attrname != NULL; ap++) {
+		if (strcmp(name, ap->attrname) != 0)
+			continue;
+		len = snprintf(buf, sizeof(buf), "%"PRIu64, *ap->attrvalp);
+		fprintf(stderr, "rbd-fuse: get %s\n", ap->attrname);
+		if (size == 0 || value == NULL)
+			return len;	/* size probe */
+		if (size < (size_t)len)
+			return -ERANGE;
+		memcpy(value, buf, len);
+		return len;
+	}
+	return 0;
+}
+
+/*
+ * FUSE listxattr: write the NUL-separated attrs[] names into list.
+ * Only accepted on "/".
+ *
+ * With len == 0 only the required length is returned.  A non-zero
+ * buffer that is too small yields -ERANGE instead of a success code
+ * with nothing written.  Returns -EINVAL for any other path.
+ */
+int
+rbdfs_listxattr(const char *path, char *list, size_t len)
+{
+	struct rbdfuse_attr *ap;
+	size_t required_len = 0;
+
+	if (strcmp(path, "/") != 0)
+		return -EINVAL;
+
+	for (ap = attrs; ap->attrname != NULL; ap++)
+		required_len += strlen(ap->attrname) + 1;
+
+	if (len == 0)
+		return (int)required_len;	/* size probe */
+	if (len < required_len)
+		return -ERANGE;
+
+	for (ap = attrs; ap->attrname != NULL; ap++) {
+		size_t n = strlen(ap->attrname) + 1;
+
+		/* copy the name together with its terminating NUL */
+		memcpy(list, ap->attrname, n);
+		list += n;
+	}
+	return (int)required_len;
+}
+
+/* FUSE operation table passed to fuse_main(); entries not listed stay unset. */
+static struct fuse_operations rbdfs_oper = {
+ .create = rbdfs_create,
+ .fsync = rbdfs_fsync,
+ .getattr = rbdfs_getattr,
+ .getxattr = rbdfs_getxattr,
+ .init = rbdfs_init,
+ .listxattr = rbdfs_listxattr,
+ .open = rbdfs_open,
+ .opendir = rbdfs_opendir,
+ .read = rbdfs_read,
+ .readdir = rbdfs_readdir,
+ .releasedir = rbdfs_releasedir,
+ .setxattr = rbdfs_setxattr,
+ .statfs = rbdfs_statfs,
+ .truncate = rbdfs_truncate,
+ .unlink = rbdfs_unlink,
+ .utime = rbdfs_utime,
+ .write = rbdfs_write,
+};
+
+/* Keys attached to the command-line option templates below. */
+enum {
+	KEY_HELP,
+	KEY_VERSION,
+	KEY_CEPH_CONFIG,
+	KEY_CEPH_CONFIG_LONG,
+	KEY_RADOS_POOLNAME,
+	KEY_RADOS_POOLNAME_LONG
+};
+
+/*
+ * Option templates handed to fuse_opt_parse().  The table must end
+ * with FUSE_OPT_END: the parser walks it until it finds the
+ * terminating entry, so without it the walk runs off the array.
+ */
+static struct fuse_opt rbdfs_opts[] = {
+	FUSE_OPT_KEY("-h", KEY_HELP),
+	FUSE_OPT_KEY("--help", KEY_HELP),
+	FUSE_OPT_KEY("-V", KEY_VERSION),
+	FUSE_OPT_KEY("--version", KEY_VERSION),
+	{"-c %s", offsetof(struct rbd_options, ceph_config), KEY_CEPH_CONFIG},
+	{"--configfile=%s", offsetof(struct rbd_options, ceph_config),
+	 KEY_CEPH_CONFIG_LONG},
+	{"-p %s", offsetof(struct rbd_options, pool_name), KEY_RADOS_POOLNAME},
+	{"--poolname=%s", offsetof(struct rbd_options, pool_name),
+	 KEY_RADOS_POOLNAME_LONG},
+	FUSE_OPT_END
+};
+
+/* Print the rbd-fuse specific usage text to stderr. */
+static void usage(const char *progname)
+{
+	static const char help_fmt[] =
+"Usage: %s mountpoint [options]\n"
+"\n"
+"General options:\n"
+" -h --help print help\n"
+" -V --version print version\n"
+" -c --configfile ceph configuration file [/etc/ceph/ceph.conf]\n"
+" -p --poolname rados pool name [rbd]\n"
+"\n";
+
+	fprintf(stderr, help_fmt, progname);
+}
+
+/*
+ * fuse_opt_parse() callback.  Help and version requests are forwarded
+ * to fuse_main() and then terminate the process; the config/pool keys
+ * replace the corresponding rbd_options string with a copy of arg
+ * past its first two characters.  Returns 0 when the argument was
+ * consumed here and 1 for every other key.
+ *
+ * NOTE(review): the initial rbd_options values are string literals;
+ * if the KEY_CEPH_CONFIG / KEY_RADOS_POOLNAME cases ever run before a
+ * strdup()ed value is in place, free() is called on a literal.
+ * Confirm whether the parser actually delivers these keys here.
+ */
+static int rbdfs_opt_proc(void *data, const char *arg, int key,
+			  struct fuse_args *outargs)
+{
+	switch (key) {
+	case KEY_HELP:
+		usage(outargs->argv[0]);
+		fuse_opt_add_arg(outargs, "-ho");
+		fuse_main(outargs->argc, outargs->argv, &rbdfs_oper, NULL);
+		exit(1);
+
+	case KEY_VERSION:
+		fuse_opt_add_arg(outargs, "--version");
+		fuse_main(outargs->argc, outargs->argv, &rbdfs_oper, NULL);
+		exit(0);
+
+	case KEY_CEPH_CONFIG:
+		/* free(NULL) is a no-op, so no separate NULL test is needed */
+		free(rbd_options.ceph_config);
+		rbd_options.ceph_config = strdup(arg+2);
+		return 0;
+
+	case KEY_RADOS_POOLNAME:
+		free(rbd_options.pool_name);
+		rbd_options.pool_name = strdup(arg+2);
+		return 0;
+
+	default:
+		return 1;
+	}
+}
+
+/*
+ * Write "<msg>: <strerror text>" to stderr; err is a negative errno
+ * value and is negated before the lookup.
+ */
+void
+simple_err(const char *msg, int err)
+{
+	const char *reason = strerror(-err);
+
+	fprintf(stderr, "%s: %s\n", msg, reason);
+}
+
+/*
+ * Create a cluster handle in *pcluster, apply environment and config
+ * file settings, and connect.  Returns 0 on success; on failure the
+ * error is reported via simple_err(), the handle is shut down if it
+ * was created, and the negative error code is returned.
+ */
+int
+connect_to_cluster(rados_t *pcluster)
+{
+	int rc = rados_create(pcluster, NULL);
+
+	if (rc < 0) {
+		simple_err("Could not create cluster handle", rc);
+		return rc;
+	}
+
+	rados_conf_parse_env(*pcluster, NULL);
+
+	rc = rados_conf_read_file(*pcluster, rbd_options.ceph_config);
+	if (rc >= 0) {
+		rc = rados_connect(*pcluster);
+		if (rc >= 0)
+			return 0;
+		simple_err("Error connecting to cluster", rc);
+	} else {
+		simple_err("Error reading Ceph config file", rc);
+	}
+
+	/* either step failed: release the handle created above */
+	rados_shutdown(*pcluster);
+	return rc;
+}
+
+/*
+ * Parse the rbd-fuse options, initialise readdir_lock and hand
+ * control to fuse_main(); exits with status 1 if option parsing
+ * fails.
+ */
+int main(int argc, char *argv[])
+{
+	struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+	int parsed;
+
+	parsed = fuse_opt_parse(&args, &rbd_options, rbdfs_opts,
+				rbdfs_opt_proc);
+	if (parsed == -1)
+		exit(1);
+
+	pthread_mutex_init(&readdir_lock, NULL);
+
+	return fuse_main(args.argc, args.argv, &rbdfs_oper, NULL);
+}
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index b2925940f77..dfa6827c7ff 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -707,7 +707,7 @@ int RGWPostObj_ObjStore_S3::get_params()
string whitespaces (" \t\f\v\n\r");
// get the part boundary
- string req_content_type_str = s->env->get("CONTENT_TYPE");
+ string req_content_type_str = s->env->get("CONTENT_TYPE", "");
string req_content_type;
map<string, string> params;
diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t
index 46194db9ffb..9bbe4d4ceeb 100644
--- a/src/test/cli/osdmaptool/clobber.t
+++ b/src/test/cli/osdmaptool/clobber.t
@@ -19,9 +19,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
- pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
max_osd 3
@@ -41,9 +41,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
- pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0
max_osd 1
diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t
index a01d27d69fa..81b91947359 100644
--- a/src/test/cli/osdmaptool/create-print.t
+++ b/src/test/cli/osdmaptool/create-print.t
@@ -10,9 +10,9 @@
modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re)
flags
- pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
- pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
- pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45
+ pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
+ pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0
max_osd 3