diff options
-rw-r--r-- | configure.ac | 18 | ||||
-rw-r--r-- | doc/radosgw/s3/php.rst | 7 | ||||
-rw-r--r-- | doc/rbd/rados-rbd-cmds.rst | 7 | ||||
-rw-r--r-- | src/.gitignore | 1 | ||||
-rw-r--r-- | src/Makefile.am | 5 | ||||
-rw-r--r-- | src/common/config_opts.h | 1 | ||||
-rw-r--r-- | src/crush/CrushWrapper.cc | 37 | ||||
-rw-r--r-- | src/crush/CrushWrapper.h | 8 | ||||
-rw-r--r-- | src/include/cephfs/libcephfs.h | 2 | ||||
-rw-r--r-- | src/include/utime.h | 2 | ||||
-rw-r--r-- | src/java/java/com/ceph/fs/CephMount.java | 51 | ||||
-rw-r--r-- | src/java/native/libcephfs_jni.cc | 177 | ||||
-rw-r--r-- | src/java/test/com/ceph/fs/CephMountTest.java | 83 | ||||
-rw-r--r-- | src/java/test/com/ceph/fs/CephUnmountedTest.java | 16 | ||||
-rw-r--r-- | src/mon/Elector.cc | 2 | ||||
-rw-r--r-- | src/mon/Monitor.cc | 116 | ||||
-rw-r--r-- | src/mon/Monitor.h | 8 | ||||
-rw-r--r-- | src/mon/OSDMonitor.cc | 16 | ||||
-rw-r--r-- | src/osd/OSDMap.cc | 62 | ||||
-rw-r--r-- | src/osd/OSDMap.h | 6 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 2 | ||||
-rwxr-xr-x | src/ps-ceph.pl | 1 | ||||
-rw-r--r-- | src/rbd.cc | 36 | ||||
-rw-r--r-- | src/rbd_fuse/rbd-fuse.c | 752 | ||||
-rw-r--r-- | src/rgw/rgw_rest_s3.cc | 2 | ||||
-rw-r--r-- | src/test/cli/osdmaptool/clobber.t | 12 | ||||
-rw-r--r-- | src/test/cli/osdmaptool/create-print.t | 6 |
27 files changed, 1353 insertions, 83 deletions
diff --git a/configure.ac b/configure.ac index b67e5cd39c3..cb49a0b2fe1 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,5 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. -m4_include(m4/acx_pthread.m4) # Autoconf AC_PREREQ(2.59) @@ -12,8 +11,15 @@ AC_PREREQ(2.59) AC_INIT([ceph], [0.56], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. +RPM_RELEASE=0 AC_SUBST(RPM_RELEASE) -RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi` +if test -d ".git" ; then + AC_CHECK_PROG(GIT_CHECK, git, yes) + if test x"$GIT_CHECK" = x"yes"; then + RPM_RELEASE=`if expr index $(git describe --always) '-' > /dev/null ; then git describe --always | cut -d- -f2- | tr '-' '.' ; else echo "0"; fi` + fi +fi +AC_MSG_NOTICE([RPM_RELEASE='$RPM_RELEASE']) AC_CONFIG_MACRO_DIR([m4]) @@ -235,6 +241,8 @@ AS_IF([test "x$with_fuse" != xno], AC_DEFINE([HAVE_LIBFUSE], [1], [Define if you have fuse]) HAVE_LIBFUSE=1 + # look for fuse_getgroups and define FUSE_GETGROUPS if found + AC_CHECK_FUNCS([fuse_getgroups]) ], [AC_MSG_FAILURE( [no FUSE found (use --without-fuse to disable)])])]) @@ -385,7 +393,8 @@ AS_IF([test "x$with_libatomic_ops" != xno], ])]) AS_IF([test "$HAVE_ATOMIC_OPS" = "1"], [], - AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you don't have atomic_ops])) + [AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])]) + AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"]) # newsyn? requires mpi. @@ -411,9 +420,6 @@ AS_IF([test "x$with_system_leveldb" = xcheck], [AC_CHECK_LIB([leveldb], [leveldb_open], [with_system_leveldb=yes], [], [-lsnappy -lpthread])]) AM_CONDITIONAL(WITH_SYSTEM_LEVELDB, [ test "$with_system_leveldb" = "yes" ]) -# look for fuse_getgroups and define FUSE_GETGROUPS if found -AC_CHECK_FUNCS([fuse_getgroups]) - # use system libs3? 
AC_ARG_WITH([system-libs3], [AS_HELP_STRING([--with-system-libs3], [use system libs3])], diff --git a/doc/radosgw/s3/php.rst b/doc/radosgw/s3/php.rst index fc3087e54d4..40542e072a2 100644 --- a/doc/radosgw/s3/php.rst +++ b/doc/radosgw/s3/php.rst @@ -21,7 +21,12 @@ This creates a connection so that you can interact with the server. require_once 'AWSSDKforPHP/sdk.class.php'; // Instantiate the S3 class and point it at the desired host - $Connection = new AmazonS3(); + $Connection = new AmazonS3(array( + 'key' => AWS_KEY, + 'secret' => AWS_SECRET_KEY, + 'canonical_id' => AWS_CANONICAL_ID, + 'canonical_name' => AWS_CANONICAL_NAME, + )); $Connection->set_hostname($HOST); $Connection->allow_hostname_override(false); diff --git a/doc/rbd/rados-rbd-cmds.rst b/doc/rbd/rados-rbd-cmds.rst index 6e28a6ab713..eec58512edc 100644 --- a/doc/rbd/rados-rbd-cmds.rst +++ b/doc/rbd/rados-rbd-cmds.rst @@ -17,13 +17,13 @@ Before you can add a block device to a Ceph client, you must create an image for it in the OSD cluster first. To create a block device image, execute the following:: - rbd create {image-name} --size {megabytes} --dest-pool {pool-name} + rbd create {image-name} --size {megabytes} --pool {pool-name} For example, to create a 1GB image named ``foo`` that stores information in a pool named ``swimmingpool``, execute the following:: rbd create foo --size 1024 - rbd create bar --size 1024 --pool swimmingpool + rbd create bar --size 1024 --pool swimmingpool .. note:: You must create a pool first before you can specify it as a source. See `Storage Pools`_ for details. @@ -99,8 +99,7 @@ For example:: rbd rm bar -p swimmingpool - .. _Storage Pools: ../../rados/operations/pools -.. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/
\ No newline at end of file +.. _RBD – Manage RADOS Block Device (RBD) Images: ../../man/8/rbd/ diff --git a/src/.gitignore b/src/.gitignore index 7548b5e47ae..d3cab1a4d1f 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -54,6 +54,7 @@ /fsconverter /xattr_bench /rest-bench +/rbd-fuse dev mondata mnt diff --git a/src/Makefile.am b/src/Makefile.am index 82d585c4507..c30c0c1a705 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -161,6 +161,11 @@ ceph_fuse_LDADD = -lfuse libclient.la $(LIBGLOBAL_LDA) ceph_fuse_CXXFLAGS = ${AM_CXXFLAGS} bin_PROGRAMS += ceph-fuse +rbd_fuse_SOURCES = rbd_fuse/rbd-fuse.c +rbd_fuse_LDADD = -lfuse librados.la librbd.la $(LIBGLOBAL_LDA) +rbd_fuse_CXXFLAGS = ${AM_CXXFLAGS} +bin_PROGRAMS += rbd-fuse + endif # tcmalloc? diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 59caca5a6a2..a778268d51a 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -127,6 +127,7 @@ OPTION(mon_osd_auto_mark_in, OPT_BOOL, false) // mark any booting osds ' OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL, true) // mark booting auto-marked-out osds 'in' OPTION(mon_osd_auto_mark_new_in, OPT_BOOL, true) // mark booting new osds 'in' OPTION(mon_osd_down_out_interval, OPT_INT, 300) // seconds +OPTION(mon_osd_down_out_subtree_limit, OPT_STR, "rack") // largest crush unit/type that we will automatically mark out OPTION(mon_osd_min_up_ratio, OPT_DOUBLE, .3) // min osds required to be up to mark things down OPTION(mon_osd_min_in_ratio, OPT_DOUBLE, .3) // min osds required to be in to mark things out OPTION(mon_lease, OPT_FLOAT, 5) // lease interval diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 3bae96c8689..45e4fb53de6 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -202,6 +202,23 @@ map<int, string> CrushWrapper::get_parent_hierarchy(int id) return parent_hierarchy; } +int CrushWrapper::get_children(int id, list<int> *children) +{ + // leaf? 
+ if (id >= 0) { + return 0; + } + + crush_bucket *b = get_bucket(id); + if (!b) { + return -ENOENT; + } + + for (unsigned n=0; n<b->size; n++) { + children->push_back(b->items[n]); + } + return b->size; +} int CrushWrapper::insert_item(CephContext *cct, int item, float weight, string name, @@ -426,24 +443,36 @@ pair<string,string> CrushWrapper::get_immediate_parent(int id) { pair <string, string> loc; - for (int bidx = 0; bidx < crush->max_buckets; bidx++) { crush_bucket *b = crush->buckets[bidx]; if (b == 0) continue; for (unsigned i = 0; i < b->size; i++) - if (b->items[i] == id){ + if (b->items[i] == id) { string parent_id = name_map[b->id]; string parent_bucket_type = type_map[b->type]; loc = make_pair(parent_bucket_type, parent_id); } } - return loc; } - +int CrushWrapper::get_immediate_parent_id(int id, int *parent) +{ + for (int bidx = 0; bidx < crush->max_buckets; bidx++) { + crush_bucket *b = crush->buckets[bidx]; + if (b == 0) + continue; + for (unsigned i = 0; i < b->size; i++) { + if (b->items[i] == id) { + *parent = b->id; + return 0; + } + } + } + return -ENOENT; +} void CrushWrapper::reweight(CephContext *cct) { diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 56bcb598ff3..7def6e4ab34 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -284,6 +284,7 @@ public: * returns the (type, name) of the parent bucket of id */ pair<string,string> get_immediate_parent(int id); + int get_immediate_parent_id(int id, int *parent); /** * get the fully qualified location of a device by successively finding @@ -302,6 +303,13 @@ public: */ map<int, string> get_parent_hierarchy(int id); + /** + * enumerate immediate children of given node + * + * @param id parent bucket or device id + * @return number of items, or error + */ + int get_children(int id, list<int> *children); /** * insert an item into the map at a specific position diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h index 
63e9233d9da..7b04cce4270 100644 --- a/src/include/cephfs/libcephfs.h +++ b/src/include/cephfs/libcephfs.h @@ -851,7 +851,7 @@ int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh); * @param fh the open file descriptor referring to the file * @param buf buffer to store the name in * @param buflen size of the buffer - * @returns length in bytes of the pool name + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. */ int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen); diff --git a/src/include/utime.h b/src/include/utime.h index 526dec568ae..f433fff4467 100644 --- a/src/include/utime.h +++ b/src/include/utime.h @@ -136,7 +136,7 @@ public: } void sleep() { - struct timespec ts = { tv.tv_sec, tv.tv_nsec }; + struct timespec ts = { (__time_t)tv.tv_sec, (long)tv.tv_nsec }; nanosleep(&ts, &ts); } diff --git a/src/java/java/com/ceph/fs/CephMount.java b/src/java/java/com/ceph/fs/CephMount.java index 10036f6c768..6051cf635d0 100644 --- a/src/java/java/com/ceph/fs/CephMount.java +++ b/src/java/java/com/ceph/fs/CephMount.java @@ -432,6 +432,23 @@ public class CephMount { * @param path Path of file to stat. * @param stat CephStat structure to hold file status. */ + public void stat(String path, CephStat stat) throws FileNotFoundException, CephNotDirectoryException { + rlock.lock(); + try { + native_ceph_stat(instance_ptr, path, stat); + } finally { + rlock.unlock(); + } + } + + private static native int native_ceph_stat(long mountp, String path, CephStat stat); + + /** + * Get file status, without following symlinks. + * + * @param path Path of file to stat. + * @param stat CephStat structure to hold file status. 
+ */ public void lstat(String path, CephStat stat) throws FileNotFoundException, CephNotDirectoryException { rlock.lock(); try { @@ -479,6 +496,23 @@ public class CephMount { private static native int native_ceph_chmod(long mountp, String path, int mode); /** + * Change file mode of an open file. + * + * @param fd The open file descriptor to change the mode bits on. + * @param mode New mode bits. + */ + public void fchmod(int fd, int mode) { + rlock.lock(); + try { + native_ceph_fchmod(instance_ptr, fd, mode); + } finally { + rlock.unlock(); + } + } + + private static native int native_ceph_fchmod(long mountp, int fd, int mode); + + /** * Truncate a file to a specified length. * * @param path Path of the file. @@ -852,6 +886,23 @@ public class CephMount { private static native int native_ceph_get_file_stripe_unit(long mountp, int fd); /** + * Get the name of the pool a file is stored in. + * + * @param fd An open file descriptor. + * @return The pool name. + */ + public String get_file_pool_name(int fd) { + rlock.lock(); + try { + return native_ceph_get_file_pool_name(instance_ptr, fd); + } finally { + rlock.unlock(); + } + } + + private static native String native_ceph_get_file_pool_name(long mountp, int fd); + + /** * Get the replication of a file. * * @param fd The file descriptor. 
diff --git a/src/java/native/libcephfs_jni.cc b/src/java/native/libcephfs_jni.cc index 2fef171a976..d5189fbf8da 100644 --- a/src/java/native/libcephfs_jni.cc +++ b/src/java/native/libcephfs_jni.cc @@ -173,7 +173,7 @@ static void cephThrowNullArg(JNIEnv *env, const char *msg) static void cephThrowOutOfMemory(JNIEnv *env, const char *msg) { - THROW(env, "java/lang/OutOfMemoryException", msg); + THROW(env, "java/lang/OutOfMemoryError", msg); } static void cephThrowInternal(JNIEnv *env, const char *msg) @@ -1189,6 +1189,35 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1symlink return ret; } +static void fill_cephstat(JNIEnv *env, jobject j_cephstat, struct stat *st) +{ + env->SetIntField(j_cephstat, cephstat_mode_fid, st->st_mode); + env->SetIntField(j_cephstat, cephstat_uid_fid, st->st_uid); + env->SetIntField(j_cephstat, cephstat_gid_fid, st->st_gid); + env->SetLongField(j_cephstat, cephstat_size_fid, st->st_size); + env->SetLongField(j_cephstat, cephstat_blksize_fid, st->st_blksize); + env->SetLongField(j_cephstat, cephstat_blocks_fid, st->st_blocks); + + long long time = st->st_mtim.tv_sec; + time *= 1000; + time += st->st_mtim.tv_nsec / 1000000; + env->SetLongField(j_cephstat, cephstat_m_time_fid, time); + + time = st->st_atim.tv_sec; + time *= 1000; + time += st->st_atim.tv_nsec / 1000000; + env->SetLongField(j_cephstat, cephstat_a_time_fid, time); + + env->SetBooleanField(j_cephstat, cephstat_is_file_fid, + S_ISREG(st->st_mode) ? JNI_TRUE : JNI_FALSE); + + env->SetBooleanField(j_cephstat, cephstat_is_directory_fid, + S_ISDIR(st->st_mode) ? JNI_TRUE : JNI_FALSE); + + env->SetBooleanField(j_cephstat, cephstat_is_symlink_fid, + S_ISLNK(st->st_mode) ? 
JNI_TRUE : JNI_FALSE); +} + /* * Class: com_ceph_fs_CephMount * Method: native_ceph_lstat @@ -1200,7 +1229,6 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1lstat struct ceph_mount_info *cmount = get_ceph_mount(j_mntp); CephContext *cct = ceph_get_mount_context(cmount); const char *c_path; - long long time; struct stat st; int ret; @@ -1227,31 +1255,49 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1lstat return ret; } - env->SetIntField(j_cephstat, cephstat_mode_fid, st.st_mode); - env->SetIntField(j_cephstat, cephstat_uid_fid, st.st_uid); - env->SetIntField(j_cephstat, cephstat_gid_fid, st.st_gid); - env->SetLongField(j_cephstat, cephstat_size_fid, st.st_size); - env->SetLongField(j_cephstat, cephstat_blksize_fid, st.st_blksize); - env->SetLongField(j_cephstat, cephstat_blocks_fid, st.st_blocks); + fill_cephstat(env, j_cephstat, &st); - time = st.st_mtim.tv_sec; - time *= 1000; - time += st.st_mtim.tv_nsec / 1000000; - env->SetLongField(j_cephstat, cephstat_m_time_fid, time); + return ret; +} - time = st.st_atim.tv_sec; - time *= 1000; - time += st.st_atim.tv_nsec / 1000000; - env->SetLongField(j_cephstat, cephstat_a_time_fid, time); +/* + * Class: com_ceph_fs_CephMount + * Method: native_ceph_stat + * Signature: (JLjava/lang/String;Lcom/ceph/fs/CephStat;)I + */ +JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1stat + (JNIEnv *env, jclass clz, jlong j_mntp, jstring j_path, jobject j_cephstat) +{ + struct ceph_mount_info *cmount = get_ceph_mount(j_mntp); + CephContext *cct = ceph_get_mount_context(cmount); + const char *c_path; + struct stat st; + int ret; - env->SetBooleanField(j_cephstat, cephstat_is_file_fid, - S_ISREG(st.st_mode) ? JNI_TRUE : JNI_FALSE); + CHECK_ARG_NULL(j_path, "@path is null", -1); + CHECK_ARG_NULL(j_cephstat, "@stat is null", -1); + CHECK_MOUNTED(cmount, -1); - env->SetBooleanField(j_cephstat, cephstat_is_directory_fid, - S_ISDIR(st.st_mode) ? 
JNI_TRUE : JNI_FALSE); + c_path = env->GetStringUTFChars(j_path, NULL); + if (!c_path) { + cephThrowInternal(env, "Failed to pin memory"); + return -1; + } - env->SetBooleanField(j_cephstat, cephstat_is_symlink_fid, - S_ISLNK(st.st_mode) ? JNI_TRUE : JNI_FALSE); + ldout(cct, 10) << "jni: lstat: path " << c_path << dendl; + + ret = ceph_stat(cmount, c_path, &st); + + ldout(cct, 10) << "jni: lstat exit ret " << ret << dendl; + + env->ReleaseStringUTFChars(j_path, c_path); + + if (ret) { + handle_error(env, ret); + return ret; + } + + fill_cephstat(env, j_cephstat, &st); return ret; } @@ -1340,6 +1386,32 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1chmod /* * Class: com_ceph_fs_CephMount + * Method: native_ceph_fchmod + * Signature: (JII)I + */ +JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1fchmod + (JNIEnv *env, jclass clz, jlong j_mntp, jint j_fd, jint j_mode) +{ + struct ceph_mount_info *cmount = get_ceph_mount(j_mntp); + CephContext *cct = ceph_get_mount_context(cmount); + int ret; + + CHECK_MOUNTED(cmount, -1); + + ldout(cct, 10) << "jni: fchmod: fd " << (int)j_fd << " mode " << (int)j_mode << dendl; + + ret = ceph_fchmod(cmount, (int)j_fd, (int)j_mode); + + ldout(cct, 10) << "jni: fchmod: exit ret " << ret << dendl; + + if (ret) + handle_error(env, ret); + + return ret; +} + +/* + * Class: com_ceph_fs_CephMount * Method: native_ceph_truncate * Signature: (JLjava/lang/String;J)I */ @@ -2385,6 +2457,67 @@ JNIEXPORT jint JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1get_1file_1repli /* * Class: com_ceph_fs_CephMount + * Method: native_ceph_get_file_pool_name + * Signature: (JI)Ljava/lang/String; + */ +JNIEXPORT jstring JNICALL Java_com_ceph_fs_CephMount_native_1ceph_1get_1file_1pool_1name + (JNIEnv *env, jclass clz, jlong j_mntp, jint j_fd) +{ + struct ceph_mount_info *cmount = get_ceph_mount(j_mntp); + CephContext *cct = ceph_get_mount_context(cmount); + jstring pool = NULL; + int ret, buflen = 0; + char *buf = NULL; + + 
CHECK_MOUNTED(cmount, NULL); + + ldout(cct, 10) << "jni: get_file_pool_name: fd " << (int)j_fd << dendl; + + for (;;) { + /* get pool name length (len==0) */ + ret = ceph_get_file_pool_name(cmount, (int)j_fd, NULL, 0); + if (ret < 0) + break; + + /* allocate buffer */ + if (buf) + delete [] buf; + buflen = ret; + buf = new (std::nothrow) char[buflen+1]; /* +1 for '\0' */ + if (!buf) { + cephThrowOutOfMemory(env, "head allocation failed"); + goto out; + } + memset(buf, 0, (buflen+1)*sizeof(*buf)); + + /* handle zero-length pool name!? */ + if (buflen == 0) + break; + + /* fill buffer */ + ret = ceph_get_file_pool_name(cmount, (int)j_fd, buf, buflen); + if (ret == -ERANGE) /* size changed! */ + continue; + else + break; + } + + ldout(cct, 10) << "jni: get_file_pool_name: ret " << ret << dendl; + + if (ret < 0) + handle_error(env, ret); + else + pool = env->NewStringUTF(buf); + +out: + if (buf) + delete [] buf; + + return pool; +} + +/* + * Class: com_ceph_fs_CephMount * Method: native_ceph_localize_reads * Signature: (JZ)I */ diff --git a/src/java/test/com/ceph/fs/CephMountTest.java b/src/java/test/com/ceph/fs/CephMountTest.java index 984c2cb7377..9d205121cc5 100644 --- a/src/java/test/com/ceph/fs/CephMountTest.java +++ b/src/java/test/com/ceph/fs/CephMountTest.java @@ -451,7 +451,9 @@ public class CephMountTest { } /* - * test_stat covers lstat and fstat + * test_stat covers lstat and fstat and stat. + * + * TODO: create test that for lstat vs stat with symlink follow/nofollow. 
*/ @Test @@ -469,6 +471,10 @@ public class CephMountTest { assertTrue(orig_st.blksize > 0); assertTrue(orig_st.blocks > 0); + /* now try stat */ + CephStat stat_st = new CephStat(); + mount.stat(path, stat_st); + /* now try fstat */ CephStat other_st = new CephStat(); fd = mount.open(path, CephMount.O_RDWR, 0); @@ -477,12 +483,40 @@ public class CephMountTest { mount.unlink(path); + /* compare to fstat results */ assertTrue(orig_st.mode == other_st.mode); assertTrue(orig_st.uid == other_st.uid); assertTrue(orig_st.gid == other_st.gid); assertTrue(orig_st.size == other_st.size); assertTrue(orig_st.blksize == other_st.blksize); assertTrue(orig_st.blocks == other_st.blocks); + + /* compare to stat results */ + assertTrue(orig_st.mode == stat_st.mode); + assertTrue(orig_st.uid == stat_st.uid); + assertTrue(orig_st.gid == stat_st.gid); + assertTrue(orig_st.size == stat_st.size); + assertTrue(orig_st.blksize == stat_st.blksize); + assertTrue(orig_st.blocks == stat_st.blocks); + } + + /* + * stat + */ + + @Test(expected=NullPointerException.class) + public void test_stat_null_path() throws Exception { + mount.stat(null, new CephStat()); + } + + @Test(expected=NullPointerException.class) + public void test_stat_null_stat() throws Exception { + mount.stat("/path", null); + } + + @Test(expected=FileNotFoundException.class) + public void test_stat_null_dne() throws Exception { + mount.stat("/path/does/not/exist", new CephStat()); } @Test(expected=CephNotDirectoryException.class) @@ -582,6 +616,36 @@ public class CephMountTest { } /* + * fchmod + */ + + @Test + public void test_fchmod() throws Exception { + /* create a file */ + String path = makePath(); + int fd = createFile(path, 1); + + CephStat st = new CephStat(); + mount.lstat(path, st); + + /* flip a bit */ + int mode = st.mode; + if ((mode & 1) != 0) + mode -= 1; + else + mode += 1; + + mount.fchmod(fd, mode); + mount.close(fd); + + CephStat st2 = new CephStat(); + mount.lstat(path, st2); + assertTrue(st2.mode == 
mode); + + mount.unlink(path); + } + + /* * truncate */ @@ -867,4 +931,21 @@ public class CephMountTest { assertTrue(poolid >= 0); assertTrue(mount.get_pool_replication(poolid) > 0); } + + @Test + public void test_get_file_pool_name() throws Exception { + String path = makePath(); + int fd = createFile(path, 1); + String pool = mount.get_file_pool_name(fd); + mount.close(fd); + assertTrue(pool != null); + /* assumes using default data pool "data" */ + assertTrue(pool.compareTo("data") == 0); + mount.unlink(path); + } + + @Test(expected=IOException.class) + public void test_get_file_pool_name_ebadf() throws Exception { + String pool = mount.get_file_pool_name(-40); + } } diff --git a/src/java/test/com/ceph/fs/CephUnmountedTest.java b/src/java/test/com/ceph/fs/CephUnmountedTest.java index ae4d41e1e98..eb95e69fb03 100644 --- a/src/java/test/com/ceph/fs/CephUnmountedTest.java +++ b/src/java/test/com/ceph/fs/CephUnmountedTest.java @@ -78,6 +78,12 @@ public class CephUnmountedTest { } @Test(expected=CephNotMountedException.class) + public void test_stat() throws Exception { + CephStat stat = new CephStat(); + mount.stat("/a/path", stat); + } + + @Test(expected=CephNotMountedException.class) public void test_lstat() throws Exception { CephStat stat = new CephStat(); mount.lstat("/a/path", stat); @@ -145,4 +151,14 @@ public class CephUnmountedTest { public void test_get_pool_replication() throws Exception { mount.get_pool_replication(1); } + + @Test(expected=CephNotMountedException.class) + public void test_fchmod() throws Exception { + mount.fchmod(1, 0); + } + + @Test(expected=CephNotMountedException.class) + public void test_chmod() throws Exception { + mount.chmod("/foo", 0); + } } diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index e2ffa6bd571..199eaeae538 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -271,6 +271,8 @@ void Elector::handle_victory(MMonElection *m) assert(from < mon->rank); assert(m->epoch % 2 == 0); + leader_acked = -1; + // i 
should have seen this election if i'm getting the victory. if (m->epoch != epoch + 1) { dout(5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 143ee65ed97..699db8968f1 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -736,7 +736,7 @@ void Monitor::reset() { dout(10) << "reset" << dendl; - timecheck_cleanup(); + timecheck_finish(); leader_since = utime_t(); if (!quorum.empty()) { @@ -1189,7 +1189,7 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features) finish_election(); if (monmap->size() > 1) - timecheck(); + timecheck_start(); } void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features) @@ -1213,6 +1213,7 @@ void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features void Monitor::finish_election() { + timecheck_finish(); exited_quorum = utime_t(); finish_contexts(g_ceph_context, waitfor_quorum); finish_contexts(g_ceph_context, maybe_wait_for_quorum); @@ -2240,18 +2241,98 @@ bool Monitor::_ms_dispatch(Message *m) return ret; } +void Monitor::timecheck_start() +{ + dout(10) << __func__ << dendl; + timecheck_cleanup(); + timecheck_start_round(); +} + +void Monitor::timecheck_finish() +{ + dout(10) << __func__ << dendl; + timecheck_cleanup(); +} + +void Monitor::timecheck_start_round() +{ + dout(10) << __func__ << " curr " << timecheck_round << dendl; + assert(is_leader()); + + if (monmap->size() == 1) { + assert(0 == "We are alone; this shouldn't have been scheduled!"); + return; + } + + if (timecheck_round % 2) { + dout(10) << __func__ << " there's a timecheck going on" << dendl; + utime_t curr_time = ceph_clock_now(g_ceph_context); + double max = g_conf->mon_timecheck_interval*3; + if (curr_time - timecheck_round_start > max) { + dout(10) << __func__ << " keep current round going" << dendl; + goto out; + } else { + dout(10) << __func__ + << " finish current timecheck and start new" 
<< dendl; + timecheck_cancel_round(); + } + } + + assert(timecheck_round % 2 == 0); + timecheck_acks = 0; + timecheck_round ++; + timecheck_round_start = ceph_clock_now(g_ceph_context); + dout(10) << __func__ << " new " << timecheck_round << dendl; + + timecheck(); +out: + dout(10) << __func__ << " setting up next event" << dendl; + timecheck_event = new C_TimeCheck(this); + timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event); +} + +void Monitor::timecheck_finish_round(bool success) +{ + dout(10) << __func__ << " curr " << timecheck_round << dendl; + assert(timecheck_round % 2); + timecheck_round ++; + timecheck_round_start = utime_t(); + + if (success) { + assert(timecheck_waiting.size() == 0); + assert(timecheck_acks == quorum.size()); + timecheck_report(); + return; + } + + dout(10) << __func__ << " " << timecheck_waiting.size() + << " peers still waiting:"; + for (map<entity_inst_t,utime_t>::iterator p = timecheck_waiting.begin(); + p != timecheck_waiting.end(); ++p) { + *_dout << " " << p->first.name; + } + *_dout << dendl; + timecheck_waiting.clear(); + + dout(10) << __func__ << " finished to " << timecheck_round << dendl; +} + +void Monitor::timecheck_cancel_round() +{ + timecheck_finish_round(false); +} + void Monitor::timecheck_cleanup() { timecheck_round = 0; timecheck_acks = 0; + timecheck_round_start = utime_t(); if (timecheck_event) { timer.cancel_event(timecheck_event); timecheck_event = NULL; } - - if (timecheck_waiting.size() > 0) - timecheck_waiting.clear(); + timecheck_waiting.clear(); timecheck_skews.clear(); timecheck_latencies.clear(); } @@ -2300,20 +2381,12 @@ void Monitor::timecheck() { dout(10) << __func__ << dendl; assert(is_leader()); - if (monmap->size() == 1) { - assert(0 == "We are alone; this shouldn't have been scheduled!"); + assert(0 == "We are alone; we shouldn't have gotten here!"); return; } + assert(timecheck_round % 2 != 0); - if ((timecheck_round % 2) != 0) { - dout(15) << __func__ - << " timecheck still in 
progress; laggy monitors maybe?" - << dendl; - goto out; - } - - timecheck_round++; timecheck_acks = 1; // we ack ourselves dout(10) << __func__ << " start timecheck epoch " << get_epoch() @@ -2336,12 +2409,6 @@ void Monitor::timecheck() dout(10) << __func__ << " send " << *m << " to " << inst << dendl; messenger->send_message(m, inst); } - -out: - dout(10) << __func__ << " setting up next event and timeout" << dendl; - timecheck_event = new C_TimeCheck(this); - - timer.add_event_after(g_conf->mon_timecheck_interval, timecheck_event); } health_status_t Monitor::timecheck_status(ostringstream &ss, @@ -2394,9 +2461,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m) dout(1) << __func__ << " our clock was readjusted --" << " bump round and drop current check" << dendl; - timecheck_round++; - timecheck_acks = 0; - timecheck_waiting.clear(); + timecheck_cancel_round(); return; } @@ -2481,8 +2546,7 @@ void Monitor::handle_timecheck_leader(MTimeCheck *m) assert(timecheck_skews.size() == timecheck_acks); assert(timecheck_waiting.size() == 0); // everyone has acked, so bump the round to finish it. - timecheck_round++; - timecheck_report(); + timecheck_finish_round(); } } diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 9716e351348..c7704bb16da 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -238,6 +238,7 @@ private: // finished. version_t timecheck_round; unsigned int timecheck_acks; + utime_t timecheck_round_start; /** * Time Check event. 
*/ @@ -247,10 +248,15 @@ private: Monitor *mon; C_TimeCheck(Monitor *m) : mon(m) { } void finish(int r) { - mon->timecheck(); + mon->timecheck_start_round(); } }; + void timecheck_start(); + void timecheck_finish(); + void timecheck_start_round(); + void timecheck_finish_round(bool success = true); + void timecheck_cancel_round(); void timecheck_cleanup(); void timecheck_report(); void timecheck(); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 5786713e043..3d11cfffc0f 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1457,6 +1457,8 @@ void OSDMonitor::tick() * ratio set by g_conf->mon_osd_min_in_ratio. So it's not really up to us. */ if (can_mark_out(-1)) { + set<int> down_cache; // quick cache of down subtrees + map<int,utime_t>::iterator i = down_pending_out.begin(); while (i != down_pending_out.end()) { int o = i->first; @@ -1483,6 +1485,20 @@ void OSDMonitor::tick() grace += my_grace; } + // is this an entire large subtree down? + if (g_conf->mon_osd_down_out_subtree_limit.length()) { + int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit.c_str()); + if (type > 0) { + if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) { + dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit + << " subtree for osd." << o << " is down; resetting timer" << dendl; + // reset timer, too. + down_pending_out[o] = now; + continue; + } + } + } + if (g_conf->mon_osd_down_out_interval > 0 && down.sec() >= grace) { dout(10) << "tick marking osd." 
<< o << " OUT after " << down diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 439ff06505a..c7d044ac6fd 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -172,6 +172,68 @@ int OSDMap::Incremental::identify_osd(uuid_d u) const return -1; } +bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const +{ + if (id >= 0) + return is_down(id); + + if (down_cache && + down_cache->count(id)) { + return true; + } + + list<int> children; + crush->get_children(id, &children); + for (list<int>::iterator p = children.begin(); p != children.end(); ++p) { + if (!subtree_is_down(*p, down_cache)) { + return false; + } + } + if (down_cache) { + down_cache->insert(id); + } + return true; +} + +bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const +{ + // use a stack-local down_cache if we didn't get one from the + // caller. then at least this particular call will avoid duplicated + // work. + set<int> local_down_cache; + if (!down_cache) { + down_cache = &local_down_cache; + } + + if (!subtree_is_down(id, down_cache)) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl; + return false; + } + + int current = id; + while (true) { + // invariant: current subtree is known to be down. + int type; + if (current >= 0) { + type = 0; + } else { + type = crush->get_bucket_type(current); + } + assert(type >= 0); + + // is this a big enough subtree to be done? + if (type >= subtree_type) { + ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ...
" << type << " >= " << subtree_type << dendl; + return true; + } + + int r = crush->get_immediate_parent_id(current, &current); + if (r < 0) { + return false; + } + } +} + void OSDMap::Incremental::encode_client_old(bufferlist& bl) const { __u16 v = 5; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 5105fc7ab0e..f3f84f0b470 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -316,6 +316,12 @@ private: bool is_in(int osd) const { return exists(osd) && !is_out(osd); } + + /** + * check if an entire crush subtre is down + */ + bool subtree_is_down(int id, set<int> *down_cache) const; + bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const; int identify_osd(const entity_addr_t& addr) const; int identify_osd(const uuid_d& u) const; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 008462af426..55e420d6e74 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -509,6 +509,7 @@ void pg_pool_t::dump(Formatter *f) const f->dump_unsigned("flags", get_flags()); f->dump_int("type", get_type()); f->dump_int("size", get_size()); + f->dump_int("min_size", get_min_size()); f->dump_int("crush_ruleset", get_crush_ruleset()); f->dump_int("object_hash", get_object_hash()); f->dump_int("pg_num", get_pg_num()); @@ -829,6 +830,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) { out << p.get_type_name() << " size " << p.get_size() + << " min_size " << p.get_min_size() << " crush_ruleset " << p.get_crush_ruleset() << " object_hash " << p.get_object_hash_name() << " pg_num " << p.get_pg_num() diff --git a/src/ps-ceph.pl b/src/ps-ceph.pl index dc236296494..03fc6061373 100755 --- a/src/ps-ceph.pl +++ b/src/ps-ceph.pl @@ -18,6 +18,7 @@ sub is_ceph_proc { return 1 if $cmdline =~ /\bceph\b/; return 1 if $cmdline =~ /\bceph-fuse\b/; + return 1 if $cmdline =~ /\brbd-fuse\b/; return 1 if $cmdline =~ /\bceph-mds\b/; return 1 if $cmdline =~ /\bceph-mon\b/; return 1 if $cmdline =~ /\bceph-osd\b/; diff --git
a/src/rbd.cc b/src/rbd.cc index 833188ae33c..dd56bc9309e 100644 --- a/src/rbd.cc +++ b/src/rbd.cc @@ -1428,8 +1428,16 @@ static int do_kernel_add(const char *poolname, const char *imgname, // modprobe the rbd module if /sys/bus/rbd doesn't exist struct stat sb; - if ((stat("/sys/bus/rbd", &sb) < 0) || (!S_ISDIR(sb.st_mode))) - system("/sbin/modprobe rbd"); + if ((stat("/sys/bus/rbd", &sb) < 0) || (!S_ISDIR(sb.st_mode))) { + r = system("/sbin/modprobe rbd"); + if (r) { + if (r < 0) + cerr << "rbd: error executing modprobe as shell command!" << std::endl; + else + cerr << "rbd: modprobe rbd failed! (" << r << ")" <<std::endl; + return r; + } + } // write to /sys/bus/rbd/add int fd = open("/sys/bus/rbd/add", O_WRONLY); @@ -1448,8 +1456,16 @@ static int do_kernel_add(const char *poolname, const char *imgname, close(fd); // let udevadm do its job before we return - if (udevadm_settle) - system("/sbin/udevadm settle"); + if (udevadm_settle) { + r = system("/sbin/udevadm settle"); + if (r) { + if (r < 0) + cerr << "rbd: error executing udevadm as shell command!" << std::endl; + else + cerr << "rbd: '/sbin/udevadm settle' failed! (" << r << ")" <<std::endl; + return r; + } + } return r; } @@ -1664,8 +1680,16 @@ static int do_kernel_rm(const char *dev) r = close(fd); // let udevadm finish, if present - if (udevadm_settle) - system("/sbin/udevadm settle"); + if (udevadm_settle){ + r = system("/sbin/udevadm settle"); + if (r) { + if (r < 0) + cerr << "rbd: error executing udevadm as shell command!" << std::endl; + else + cerr << "rbd: '/sbin/udevadm settle' failed! 
(" << r << ")" <<std::endl; + return r; + } + } if (r < 0) r = -errno; diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c new file mode 100644 index 00000000000..748976aabaf --- /dev/null +++ b/src/rbd_fuse/rbd-fuse.c @@ -0,0 +1,752 @@ +/* + * rbd-fuse + */ +#define FUSE_USE_VERSION 26 + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <fuse.h> +#include <pthread.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> +#include <getopt.h> +#include <inttypes.h> + +#include "include/rbd/librbd.h" + +static int gotrados = 0; +char *pool_name; +rados_t cluster; +rados_ioctx_t ioctx; + +static pthread_mutex_t readdir_lock; + +struct rbd_stat { + u_char valid; + rbd_image_info_t rbd_info; +}; + +struct rbd_options { + char *ceph_config; + char *pool_name; +}; + +struct rbd_image { + char *image_name; + struct rbd_image *next; +}; +struct rbd_image *rbd_images; + +struct rbd_openimage { + char *image_name; + rbd_image_t image; + struct rbd_stat rbd_stat; +}; +#define MAX_RBD_IMAGES 128 +struct rbd_openimage opentbl[MAX_RBD_IMAGES]; + +struct rbd_options rbd_options = {"/etc/ceph/ceph.conf", "rbd"}; + +#define rbdsize(fd) opentbl[fd].rbd_stat.rbd_info.size +#define rbdblksize(fd) opentbl[fd].rbd_stat.rbd_info.obj_size +#define rbdblkcnt(fd) opentbl[fd].rbd_stat.rbd_info.num_objs + +uint64_t imagesize = 1024ULL * 1024 * 1024; +uint64_t imageorder = 22ULL; +uint64_t imagefeatures = 1ULL; + +// Minimize calls to rbd_list: marks bracketing of opendir/<ops>/releasedir +int in_opendir; + +/* prototypes */ +int connect_to_cluster(rados_t *pcluster); +void enumerate_images(struct rbd_image **head); +int open_rbd_image(const char *image_name); +int find_openrbd(const char *path); + +void simple_err(const char *msg, int err); + +void +enumerate_images(struct rbd_image **head) +{ + char *ibuf; + size_t ibuf_len; + struct rbd_image *im, *next; + char *ip; + int actual_len; + + 
if (*head != NULL) { + for (im = *head; im != NULL;) { + next = im->next; + free(im); + im = next; + } + *head = NULL; + } + + ibuf_len = 1024; + ibuf = malloc(ibuf_len); + actual_len = rbd_list(ioctx, ibuf, &ibuf_len); + if (actual_len < 0) { + simple_err("rbd_list: error %d\n", actual_len); + return; + } + + fprintf(stderr, "pool %s: ", pool_name); + for (ip = ibuf; *ip != '\0' && ip < &ibuf[actual_len]; + ip += strlen(ip) + 1) { + fprintf(stderr, "%s, ", ip); + im = malloc(sizeof(*im)); + im->image_name = ip; + im->next = *head; + *head = im; + } + fprintf(stderr, "\n"); + return; +} + +int +find_openrbd(const char *path) +{ + int i; + + /* find in opentbl[] entry if already open */ + for (i = 0; i < MAX_RBD_IMAGES; i++) { + if ((opentbl[i].image_name != NULL) && + (strcmp(opentbl[i].image_name, path) == 0)) { + return i; + break; + } + } + return -1; +} + +int +open_rbd_image(const char *image_name) +{ + struct rbd_image *im; + struct rbd_openimage *rbd; + int fd, i; + int ret; + + if (image_name == (char *)NULL) + return -1; + + // relies on caller to keep rbd_images up to date + for (im = rbd_images; im != NULL; i++, im = im->next) { + if (strcmp(im->image_name, image_name) == 0) { + break; + } + } + if (im == NULL) + return -1; + + /* find in opentbl[] entry if already open */ + if ((fd = find_openrbd(image_name)) != -1) { + rbd = &opentbl[fd]; + } else { + // allocate an opentbl[] and open the image + for (i = 0; i < MAX_RBD_IMAGES; i++) { + if (opentbl[i].image == NULL) { + fd = i; + rbd = &opentbl[fd]; + rbd->image_name = strdup(image_name); + break; + } + } + if (i == MAX_RBD_IMAGES) + return -1; + ret = rbd_open(ioctx, rbd->image_name, &(rbd->image), NULL); + if (ret < 0) { + simple_err("open_rbd_image: can't open: ", ret); + return ret; + } + } + rbd_stat(rbd->image, &(rbd->rbd_stat.rbd_info), + sizeof(rbd_image_info_t)); + rbd->rbd_stat.valid = 1; + return fd; +} + +static void +iter_images(void *cookie, + void (*iter)(void *cookie, const char 
*image)) +{ + struct rbd_image *im; + + pthread_mutex_lock(&readdir_lock); + + for (im = rbd_images; im != NULL; im = im->next) + iter(cookie, im->image_name); + pthread_mutex_unlock(&readdir_lock); +} + +static void count_images_cb(void *cookie, const char *image) +{ + (*((unsigned int *)cookie))++; +} + +static int count_images(void) +{ + unsigned int count = 0; + + pthread_mutex_lock(&readdir_lock); + enumerate_images(&rbd_images); + pthread_mutex_unlock(&readdir_lock); + + iter_images(&count, count_images_cb); + return count; +} + +static int rbdfs_getattr(const char *path, struct stat *stbuf) +{ + int fd; + time_t now; + + if (!gotrados) + return -ENXIO; + + if (path[0] == 0) + return -ENOENT; + + memset(stbuf, 0, sizeof(struct stat)); + + if (strcmp(path, "/") == 0) { + + now = time(NULL); + stbuf->st_mode = S_IFDIR + 0755; + stbuf->st_nlink = 2+count_images(); + stbuf->st_uid = getuid(); + stbuf->st_gid = getgid(); + stbuf->st_size = 1024; + stbuf->st_blksize = 1024; + stbuf->st_blocks = 1; + stbuf->st_atime = now; + stbuf->st_mtime = now; + stbuf->st_ctime = now; + + return 0; + } + + if (!in_opendir) { + pthread_mutex_lock(&readdir_lock); + enumerate_images(&rbd_images); + pthread_mutex_unlock(&readdir_lock); + } + fd = open_rbd_image(path + 1); + if (fd < 0) + return -ENOENT; + + now = time(NULL); + stbuf->st_mode = S_IFREG | 0666; + stbuf->st_nlink = 1; + stbuf->st_uid = getuid(); + stbuf->st_gid = getgid(); + stbuf->st_size = rbdsize(fd); + stbuf->st_blksize = rbdblksize(fd); + stbuf->st_blocks = rbdblkcnt(fd); + stbuf->st_atime = now; + stbuf->st_mtime = now; + stbuf->st_ctime = now; + + return 0; +} + + +static int rbdfs_open(const char *path, struct fuse_file_info *fi) +{ + int fd; + + if (!gotrados) + return -ENXIO; + + if (path[0] == 0) + return -ENOENT; + + pthread_mutex_lock(&readdir_lock); + enumerate_images(&rbd_images); + pthread_mutex_unlock(&readdir_lock); + fd = open_rbd_image(path + 1); + if (fd < 0) + return -ENOENT; + + fi->fh = fd; + 
return 0; +} + +static int rbdfs_read(const char *path, char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + size_t numread; + struct rbd_openimage *rbd; + + if (!gotrados) + return -ENXIO; + + rbd = &opentbl[fi->fh]; + numread = 0; + while (size > 0) { + ssize_t ret; + + ret = rbd_read(rbd->image, offset, size, buf); + + if (ret <= 0) + break; + buf += ret; + size -= ret; + offset += ret; + numread += ret; + } + + return numread; +} + +static int rbdfs_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + size_t numwritten; + struct rbd_openimage *rbd; + + if (!gotrados) + return -ENXIO; + + rbd = &opentbl[fi->fh]; + numwritten = 0; + while (size > 0) { + ssize_t ret; + + if (offset + size > rbdsize(fi->fh)) { + int r; + fprintf(stderr, "rbdfs_write resizing %s to 0x%"PRIxMAX"\n", + path, offset+size); + r = rbd_resize(rbd->image, offset+size); + if (r < 0) + return r; + + r = rbd_stat(rbd->image, &(rbd->rbd_stat.rbd_info), + sizeof(rbd_image_info_t)); + if (r < 0) + return r; + } + ret = rbd_write(rbd->image, offset, size, buf); + + if (ret < 0) + break; + buf += ret; + size -= ret; + offset += ret; + numwritten += ret; + } + + return numwritten; +} + +static void rbdfs_statfs_image_cb(void *num, const char *image) +{ + int fd; + + ((uint64_t *)num)[0]++; + + fd = open_rbd_image(image); + if (fd >= 0) + ((uint64_t *)num)[1] += rbdsize(fd); +} + +static int rbdfs_statfs(const char *path, struct statvfs *buf) +{ + uint64_t num[2]; + + if (!gotrados) + return -ENXIO; + + num[0] = 1; + num[1] = 0; + pthread_mutex_lock(&readdir_lock); + enumerate_images(&rbd_images); + pthread_mutex_unlock(&readdir_lock); + iter_images(num, rbdfs_statfs_image_cb); + +#define RBDFS_BSIZE 4096 + buf->f_bsize = RBDFS_BSIZE; + buf->f_frsize = RBDFS_BSIZE; + buf->f_blocks = num[1] / RBDFS_BSIZE; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = num[0]; + buf->f_ffree = 0; + buf->f_favail = 0; + buf->f_fsid = 0; + 
buf->f_flag = 0; + buf->f_namemax = PATH_MAX; + + return 0; +} + +static int rbdfs_fsync(const char *path, int datasync, + struct fuse_file_info *fi) +{ + if (!gotrados) + return -ENXIO; + rbd_flush(opentbl[fi->fh].image); + return 0; +} + +static int rbdfs_opendir(const char *path, struct fuse_file_info *fi) +{ + // only one directory, so global "in_opendir" flag should be fine + pthread_mutex_lock(&readdir_lock); + in_opendir++; + enumerate_images(&rbd_images); + pthread_mutex_unlock(&readdir_lock); + return 0; +} + +struct rbdfs_readdir_info { + void *buf; + fuse_fill_dir_t filler; +}; + +static void rbdfs_readdir_cb(void *_info, const char *name) +{ + struct rbdfs_readdir_info *info = _info; + + info->filler(info->buf, name, NULL, 0); +} + +static int rbdfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, + off_t offset, struct fuse_file_info *fi) +{ + struct rbdfs_readdir_info info = { buf, filler }; + + if (!gotrados) + return -ENXIO; + if (!in_opendir) + fprintf(stderr, "in readdir, but not inside opendir?\n"); + + if (strcmp(path, "/") != 0) + return -ENOENT; + + filler(buf, ".", NULL, 0); + filler(buf, "..", NULL, 0); + iter_images(&info, rbdfs_readdir_cb); + + return 0; +} +static int rbdfs_releasedir(const char *path, struct fuse_file_info *fi) +{ + // see opendir comments + pthread_mutex_lock(&readdir_lock); + in_opendir--; + pthread_mutex_unlock(&readdir_lock); + return 0; +} + +void * +rbdfs_init(struct fuse_conn_info *conn) +{ + int ret; + + // init cannot fail, so if we fail here, gotrados remains at 0, + // causing other operations to fail immediately with ENXIO + + ret = connect_to_cluster(&cluster); + if (ret < 0) + exit(90); + + pool_name = rbd_options.pool_name; + ret = rados_ioctx_create(cluster, pool_name, &ioctx); + if (ret < 0) + exit(91); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + conn->want |= FUSE_CAP_BIG_WRITES; +#endif + gotrados = 1; + + // init's return value shows up in fuse_context.private_data, + // also to void 
(*destroy)(void *); useful? + return NULL; +} + +// return -errno on error. fi->fh is not set until open time + +int +rbdfs_create(const char *path, mode_t mode, struct fuse_file_info *fi) +{ + int r; + int order = imageorder; + + r = rbd_create2(ioctx, path+1, imagesize, imagefeatures, &order); + return r; +} + +int +rbdfs_utime(const char *path, struct utimbuf *utime) +{ + // called on create; not relevant + return 0; +} + +int +rbdfs_unlink(const char *path) +{ + int fd = find_openrbd(path); + if (fd != -1) { + struct rbd_openimage *rbd = &opentbl[fd]; + rbd_close(rbd->image); + rbd->image = 0; + free(rbd->image_name); + rbd->rbd_stat.valid = 0; + } + return rbd_remove(ioctx, path+1); +} + + +int +rbdfs_truncate(const char *path, off_t size) +{ + int fd; + int r; + struct rbd_openimage *rbd; + + if ((fd = open_rbd_image(path+1)) < 0) + return -ENOENT; + + rbd = &opentbl[fd]; + fprintf(stderr, "truncate %s to %"PRIdMAX" (0x%"PRIxMAX")\n", path, size, size); + r = rbd_resize(rbd->image, size); + if (r < 0) + return r; + + r = rbd_stat(rbd->image, &(rbd->rbd_stat.rbd_info), + sizeof(rbd_image_info_t)); + if (r < 0) + return r; + return 0; +} + +/** + * set an xattr on path, with name/value, length size. + * Presumably flags are from Linux, as in XATTR_CREATE or + * XATTR_REPLACE (both "set", but fail if exist vs fail if not exist. + * + * We accept xattrs only on the root node. 
+ * + * All values converted with strtoull, so can be expressed in any base + */ + +struct rbdfuse_attr { + char *attrname; + uint64_t *attrvalp; +} attrs[] = { + { "user.rbdfuse.imagesize", &imagesize }, + { "user.rbdfuse.imageorder", &imageorder }, + { "user.rbdfuse.imagefeatures", &imagefeatures }, + { NULL } +}; + +int +rbdfs_setxattr(const char *path, const char *name, const char *value, + size_t size, int flags) +{ + struct rbdfuse_attr *ap; + if (strcmp(path, "/") != 0) + return -EINVAL; + + for (ap = attrs; ap->attrname != NULL; ap++) { + if (strcmp(name, ap->attrname) == 0) { + *ap->attrvalp = strtoull(value, NULL, 0); + fprintf(stderr, "rbd-fuse: %s set to 0x%"PRIx64"\n", + ap->attrname, *ap->attrvalp); + return 0; + } + } + return -EINVAL; +} + +int +rbdfs_getxattr(const char *path, const char *name, char *value, + size_t size) +{ + struct rbdfuse_attr *ap; + char buf[128]; + // allow gets on other files; ls likes to ask for things like + // security.* + + for (ap = attrs; ap->attrname != NULL; ap++) { + if (strcmp(name, ap->attrname) == 0) { + sprintf(buf, "%"PRIu64, *ap->attrvalp); + if (value != NULL && size >= strlen(buf)) + strcpy(value, buf); + fprintf(stderr, "rbd-fuse: get %s\n", ap->attrname); + return (strlen(buf)); + } + } + return 0; +} + +int +rbdfs_listxattr(const char *path, char *list, size_t len) +{ + struct rbdfuse_attr *ap; + int required_len = 0; + + if (strcmp(path, "/") != 0) + return -EINVAL; + + for (ap = attrs; ap->attrname != NULL; ap++) + required_len += strlen(ap->attrname) + 1; + if (len >= required_len) { + for (ap = attrs; ap->attrname != NULL; ap++) { + sprintf(list, "%s", ap->attrname); + list += strlen(ap->attrname) + 1; + } + } + return required_len; +} + +static struct fuse_operations rbdfs_oper = { + .create = rbdfs_create, + .fsync = rbdfs_fsync, + .getattr = rbdfs_getattr, + .getxattr = rbdfs_getxattr, + .init = rbdfs_init, + .listxattr = rbdfs_listxattr, + .open = rbdfs_open, + .opendir = rbdfs_opendir, + .read = 
rbdfs_read, + .readdir = rbdfs_readdir, + .releasedir = rbdfs_releasedir, + .setxattr = rbdfs_setxattr, + .statfs = rbdfs_statfs, + .truncate = rbdfs_truncate, + .unlink = rbdfs_unlink, + .utime = rbdfs_utime, + .write = rbdfs_write, +}; + +enum { + KEY_HELP, + KEY_VERSION, + KEY_CEPH_CONFIG, + KEY_CEPH_CONFIG_LONG, + KEY_RADOS_POOLNAME, + KEY_RADOS_POOLNAME_LONG +}; + +static struct fuse_opt rbdfs_opts[] = { + FUSE_OPT_KEY("-h", KEY_HELP), + FUSE_OPT_KEY("--help", KEY_HELP), + FUSE_OPT_KEY("-V", KEY_VERSION), + FUSE_OPT_KEY("--version", KEY_VERSION), + {"-c %s", offsetof(struct rbd_options, ceph_config), KEY_CEPH_CONFIG}, + {"--configfile=%s", offsetof(struct rbd_options, ceph_config), + KEY_CEPH_CONFIG_LONG}, + {"-p %s", offsetof(struct rbd_options, pool_name), KEY_RADOS_POOLNAME}, + {"--poolname=%s", offsetof(struct rbd_options, pool_name), + KEY_RADOS_POOLNAME_LONG}, +}; + +static void usage(const char *progname) +{ + fprintf(stderr, +"Usage: %s mountpoint [options]\n" +"\n" +"General options:\n" +" -h --help print help\n" +" -V --version print version\n" +" -c --configfile ceph configuration file [/etc/ceph/ceph.conf]\n" +" -p --poolname rados pool name [rbd]\n" +"\n", progname); +} + +static int rbdfs_opt_proc(void *data, const char *arg, int key, + struct fuse_args *outargs) +{ + if (key == KEY_HELP) { + usage(outargs->argv[0]); + fuse_opt_add_arg(outargs, "-ho"); + fuse_main(outargs->argc, outargs->argv, &rbdfs_oper, NULL); + exit(1); + } + + if (key == KEY_VERSION) { + fuse_opt_add_arg(outargs, "--version"); + fuse_main(outargs->argc, outargs->argv, &rbdfs_oper, NULL); + exit(0); + } + + if (key == KEY_CEPH_CONFIG) { + if (rbd_options.ceph_config != NULL) { + free(rbd_options.ceph_config); + rbd_options.ceph_config = NULL; + } + rbd_options.ceph_config = strdup(arg+2); + return 0; + } + + if (key == KEY_RADOS_POOLNAME) { + if (rbd_options.pool_name != NULL) { + free(rbd_options.pool_name); + rbd_options.pool_name = NULL; + } + rbd_options.pool_name = 
strdup(arg+2); + return 0; + } + + return 1; +} + +void +simple_err(const char *msg, int err) +{ + fprintf(stderr, "%s: %s\n", msg, strerror(-err)); + return; +} + +int +connect_to_cluster(rados_t *pcluster) +{ + int r; + + r = rados_create(pcluster, NULL); + if (r < 0) { + simple_err("Could not create cluster handle", r); + return r; + } + rados_conf_parse_env(*pcluster, NULL); + r = rados_conf_read_file(*pcluster, rbd_options.ceph_config); + if (r < 0) { + simple_err("Error reading Ceph config file", r); + goto failed_shutdown; + } + r = rados_connect(*pcluster); + if (r < 0) { + simple_err("Error connecting to cluster", r); + goto failed_shutdown; + } + + return 0; + +failed_shutdown: + rados_shutdown(*pcluster); + return r; +} + +int main(int argc, char *argv[]) +{ + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); + + if (fuse_opt_parse(&args, &rbd_options, rbdfs_opts, rbdfs_opt_proc) + == -1) { + exit(1); + } + + pthread_mutex_init(&readdir_lock, NULL); + + return fuse_main(args.argc, args.argv, &rbdfs_oper, NULL); +} diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index b2925940f77..dfa6827c7ff 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -707,7 +707,7 @@ int RGWPostObj_ObjStore_S3::get_params() string whitespaces (" \t\f\v\n\r"); // get the part boundary - string req_content_type_str = s->env->get("CONTENT_TYPE"); + string req_content_type_str = s->env->get("CONTENT_TYPE", ""); string req_content_type; map<string, string> params; diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t index 46194db9ffb..9bbe4d4ceeb 100644 --- a/src/test/cli/osdmaptool/clobber.t +++ b/src/test/cli/osdmaptool/clobber.t @@ -19,9 +19,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins pg_num 192 
pgp_num 192 last_change 0 owner 0 - pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 max_osd 3 @@ -41,9 +41,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 - pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 owner 0 max_osd 1 diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t index a01d27d69fa..81b91947359 100644 --- a/src/test/cli/osdmaptool/create-print.t +++ b/src/test/cli/osdmaptool/create-print.t @@ -10,9 +10,9 @@ modified \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ (re) flags - pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 - pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 - pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 
last_change 0 owner 0 + pool 0 'data' rep size 2 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 crash_replay_interval 45 + pool 1 'metadata' rep size 2 min_size 1 crush_ruleset 1 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 + pool 2 'rbd' rep size 2 min_size 1 crush_ruleset 2 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 owner 0 max_osd 3 |