19 files changed, 610 insertions, 30 deletions
diff --git a/AUTHORS b/AUTHORS
index 08f3b1ca729..289f54bf67b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -13,3 +13,7 @@ Patience Warnick <patience@newdream.net>
 Yehuda Sadeh-Weinraub <yehudasa@gmail.com>
 Greg Farnum <gregf@hq.newdream.net>
 
+Contributors
+------------
+
+Loic Dachary <loic@dachary.org>
diff --git a/COPYING b/COPYING
index 20ab537172d..888e30e679f 100644
--- a/COPYING
+++ b/COPYING
@@ -98,3 +98,6 @@ License:
 
 
 
+Files: test/common/Throttle.cc
+Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+License: LGPL2 or later
diff --git a/README b/README
index eb10688544a..c249abcb90b 100644
--- a/README
+++ b/README
@@ -121,9 +121,9 @@ To build the source code, you must install the following:
 - libatomic-ops-dev
 - libboost-program-options-dev
 - libboost-thread-dev
-
+- libexpat1-dev
 
 
 For example:
 
-	$ apt-get install automake autoconf automake gcc g++ libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libaio-dev libgoogle-perftools-dev libkeyutils-dev uuid-dev libatomic-ops-dev libboost-program-options-dev libboost-thread-dev
+	$ apt-get install automake autoconf automake gcc g++ libboost-dev libedit-dev libssl-dev libtool libfcgi libfcgi-dev libfuse-dev linux-kernel-headers libcrypto++-dev libaio-dev libgoogle-perftools-dev libkeyutils-dev uuid-dev libatomic-ops-dev libboost-program-options-dev libboost-thread-dev libexpat1-dev
diff --git a/doc/radosgw/config.rst b/doc/radosgw/config.rst
index 5c2c2c27216..a4dc85aff4a 100644
--- a/doc/radosgw/config.rst
+++ b/doc/radosgw/config.rst
@@ -22,6 +22,7 @@ For example::
 		rgw socket path = /tmp/radosgw.sock
 		log file = /var/log/ceph/radosgw.log
 
+.. note:: ``host`` must be your machine hostname, not FQDN.
 
 Deploy ``ceph.conf``
 ====================
@@ -315,10 +316,9 @@ packages.
 
 RGW's ``user:subuser`` tuple maps to the ``tenant:user`` tuple expected by Swift.
 
-.. important:: RGW's Swift authentication service only supports
-   built-in Swift authentication (``-V 1.0``) at this point. There is
-   currently no way to make RGW authenticate users via OpenStack
-   Identity Service (Keystone).
+.. note:: RGW's Swift authentication service only supports built-in Swift 
+   authentication (``-V 1.0``). To make RGW authenticate users via OpenStack
+   Identity Service (Keystone), see below.
 
 Integrating with OpenStack Keystone
 ===================================
@@ -332,7 +332,7 @@ by RGW.
 The following config options are available for Keystone integration::
 
 	[client.radosgw.gateway]
-		rgw keystone url = {keystone server url}
+		rgw keystone url = {keystone server url:keystone server admin port}
 		rgw keystone admin token = {keystone admin token}
 		rgw keystone accepted roles = {accepted user roles}
 		rgw keystone token cache size = {number of tokens to cache}
@@ -349,7 +349,8 @@ Keystone itself needs to be configured to point to RGW as an object-storage
 endpoint::
 
 	keystone service-create --name swift --type-object-store
-	keystone endpoint-create --service-id <id> --public-url http://radosgw.example.com/swift/v1
+	keystone endpoint-create --service-id <id> --publicurl http://radosgw.example.com/swift/v1 \
+		--internalurl http://radosgw.example.com/swift/v1 --adminurl http://radosgw.example.com/swift/v1
 
 
 The keystone url is the Keystone admin RESTful api url. The admin token is the
diff --git a/qa/workunits/mon/crush_ops.sh b/qa/workunits/mon/crush_ops.sh
new file mode 100755
index 00000000000..735646b5ca0
--- /dev/null
+++ b/qa/workunits/mon/crush_ops.sh
@@ -0,0 +1,23 @@
+#!/bin/sh -x
+
+set -e
+
+ceph osd crush dump
+ceph osd crush rule dump
+ceph osd crush rule ls
+ceph osd crush rule list
+
+ceph osd crush rule create-simple foo default host
+ceph osd crush rule create-simple foo default host
+ceph osd crush rule create-simple bar default host
+
+ceph osd crush rule ls | grep foo
+
+ceph osd crush rule rm foo
+ceph osd crush rule rm foo  # idempotent
+ceph osd crush rule rm bar
+
+# can't delete in-use rules, tho:
+ceph osd crush rule rm data && exit 1 || true
+
+echo OK
diff --git a/src/Makefile.am b/src/Makefile.am
index 622834f0b0e..6a9bb147824 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -671,6 +671,12 @@ unittest_log_LDADD = libcommon.la ${UNITTEST_LDADD}
 unittest_log_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2
 check_PROGRAMS += unittest_log
 
+unittest_throttle_SOURCES = test/common/Throttle.cc
+unittest_throttle_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
+unittest_throttle_LDADD = libcommon.la ${LIBGLOBAL_LDA} ${UNITTEST_LDADD}
+unittest_throttle_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} -O2
+check_PROGRAMS += unittest_throttle
+
 unittest_base64_SOURCES = test/base64.cc
 unittest_base64_LDFLAGS = $(PTHREAD_CFLAGS) ${AM_LDFLAGS}
 unittest_base64_LDADD = libcephfs.la -lm ${UNITTEST_LDADD}
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 6cff22be9f0..4ff30797284 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -2566,7 +2566,7 @@ public:
   }
 };
 
-bool Client::_flush(Inode *in)
+bool Client::_flush(Inode *in, Context *onfinish)
 {
   ldout(cct, 10) << "_flush " << *in << dendl;
 
@@ -2575,7 +2575,9 @@ bool Client::_flush(Inode *in)
     return true;
   }
 
-  Context *onfinish = new C_Client_PutInode(this, in);
+  if (!onfinish) {
+    onfinish = new C_Client_PutInode(this, in);
+  }
   bool safe = objectcacher->flush_set(&in->oset, onfinish);
   if (safe) {
     onfinish->complete(0);
@@ -5877,11 +5879,19 @@ int Client::_fsync(Fh *f, bool syncdataonly)
   Inode *in = f->inode;
   tid_t wait_on_flush = 0;
   bool flushed_metadata = false;
+  Mutex lock("Client::_fsync::lock");
+  Cond cond;
+  bool done = false;
+  C_SafeCond *object_cacher_completion = NULL;
 
   ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
   
-  if (cct->_conf->client_oc)
-    _flush(in);
+  if (cct->_conf->client_oc) {
+    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
+    in->get(); // take a reference; C_SafeCond doesn't and _flush won't either
+    _flush(in, object_cacher_completion);
+    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
+  }
   
   if (!syncdataonly && (in->dirty_caps & ~CEPH_CAP_ANY_FILE_WR)) {
     for (map<int, Cap*>::iterator iter = in->caps.begin(); iter != in->caps.end(); ++iter) {
@@ -5893,18 +5903,35 @@ int Client::_fsync(Fh *f, bool syncdataonly)
     flushed_metadata = true;
   } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
 
-  // FIXME: this can starve
-  while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
-    ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
-	     << " uncommitted, waiting" << dendl;
-    wait_on_list(in->waitfor_commit);
+  if (object_cacher_completion) { // wait on a real reply instead of guessing
+    client_lock.Unlock();
+    lock.Lock();
+    ldout(cct, 15) << "waiting on data to flush" << dendl;
+    while (!done)
+      cond.Wait(lock);
+    lock.Unlock();
+    client_lock.Lock();
+    put_inode(in);
+    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
+  } else {
+    // FIXME: this can starve
+    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
+      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
+		     << " uncommitted, waiting" << dendl;
+      wait_on_list(in->waitfor_commit);
+    }
   }
 
-  if (!flushed_metadata) wait_sync_caps(wait_on_flush); //this could wait longer than strictly necessary,
-                                                    //but on a sync the user can put up with it
-
-  ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
+  if (!r) {
+    if (flushed_metadata) wait_sync_caps(wait_on_flush);
+    // this could wait longer than strictly necessary,
+    // but on a sync the user can put up with it
 
+    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
+  } else {
+    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
+		  << cpp_strerror(-r) << dendl;
+  }
   return r;
 }
 
diff --git a/src/client/Client.h b/src/client/Client.h
index b3b1f87cf46..3fcdf481ad1 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -451,7 +451,19 @@ protected:
   void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps);
   void _async_invalidate(Inode *in, int64_t off, int64_t len, bool keep_caps);
   void _release(Inode *in);
-  bool _flush(Inode *in);
+  
+  /**
+   * Initiate a flush of the data associated with the given inode.
+   * If you specify a Context, you are responsible for holding an inode
+   * reference for the duration of the flush. If not, _flush() will
+   * take the reference for you.
+   * @param in The Inode whose data you wish to flush.
+   * @param c The Context you wish us to complete once the data is
+   * flushed. If already flushed, this will be called in-line.
+   * 
+   * @returns true if the data was already flushed, false otherwise.
+   */
+  bool _flush(Inode *in, Context *c=NULL);
   void _flush_range(Inode *in, int64_t off, uint64_t size);
   void _flushed(Inode *in);
   void flush_set_callback(ObjectCacher::ObjectSet *oset);
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
index 844263aa111..82ffe7a9fc5 100644
--- a/src/common/Throttle.cc
+++ b/src/common/Throttle.cc
@@ -65,7 +65,7 @@ Throttle::~Throttle()
 void Throttle::_reset_max(int64_t m)
 {
   assert(lock.is_locked());
-  if (m < ((int64_t)max.read()) && !cond.empty())
+  if (!cond.empty())
     cond.front()->SignalOne();
   logger->set(l_throttle_max, m);
   max.set((size_t)m);
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 0d2e011721a..5e0449e3606 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -434,7 +434,7 @@ OPTION(filestore_kill_at, OPT_INT, 0)            // inject a failure at the n'th
 OPTION(filestore_inject_stall, OPT_INT, 0)       // artificially stall for N seconds in op queue thread
 OPTION(filestore_fail_eio, OPT_BOOL, true)       // fail/crash on EIO
 OPTION(journal_dio, OPT_BOOL, true)
-OPTION(journal_aio, OPT_BOOL, false)
+OPTION(journal_aio, OPT_BOOL, true)
 OPTION(journal_block_align, OPT_BOOL, true)
 OPTION(journal_max_write_bytes, OPT_INT, 10 << 20)
 OPTION(journal_max_write_entries, OPT_INT, 100)
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 45e4fb53de6..a22f23509c9 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -488,6 +488,61 @@ void CrushWrapper::reweight(CephContext *cct)
   }
 }
 
+int CrushWrapper::add_simple_rule(string name, string root_name, string failure_domain_name)
+{
+  if (rule_exists(name))
+    return -EEXIST;
+  if (!name_exists(root_name.c_str()))
+    return -ENOENT;
+  int root = get_item_id(root_name.c_str());
+  int type = 0;
+  if (failure_domain_name.length()) {
+    type = get_type_id(failure_domain_name.c_str());
+    if (type <= 0) // bah, returns 0 on error; but its ok, device isn't a domain really
+      return -EINVAL;
+  }
+
+  int ruleset = 0;
+  for (int i = 0; i < get_max_rules(); i++) {
+    if (rule_exists(i) &&
+	get_rule_mask_ruleset(i) >= ruleset) {
+      ruleset = get_rule_mask_ruleset(i) + 1;
+    }
+  }
+
+  crush_rule *rule = crush_make_rule(3, ruleset, 1 /* pg_pool_t::TYPE_REP */, 1, 10);
+  assert(rule);
+  crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
+  if (type)
+    crush_rule_set_step(rule, 1,
+			CRUSH_RULE_CHOOSE_LEAF_FIRSTN,
+			CRUSH_CHOOSE_N,
+			type);
+  else
+    crush_rule_set_step(rule, 1,
+			CRUSH_RULE_CHOOSE_FIRSTN,
+			CRUSH_CHOOSE_N,
+			0);
+  crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+  int rno = crush_add_rule(crush, rule, -1);
+  set_rule_name(rno, name.c_str());
+  have_rmaps = false;
+  return rno;
+}
+
+int CrushWrapper::remove_rule(int ruleno)
+{
+  if (ruleno >= (int)crush->max_rules)
+    return -ENOENT;
+  if (crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  crush_destroy_rule(crush->rules[ruleno]);
+  crush->rules[ruleno] = NULL;
+  rule_name_map.erase(ruleno);
+  have_rmaps = false;
+  return 0;
+}
+
 void CrushWrapper::encode(bufferlist& bl, bool lean) const
 {
   assert(crush);
@@ -817,6 +872,12 @@ void CrushWrapper::dump(Formatter *f) const
   f->close_section();
 
   f->open_array_section("rules");
+  dump_rules(f);
+  f->close_section();
+}
+
+void CrushWrapper::dump_rules(Formatter *f) const
+{
   for (int i=0; i<get_max_rules(); i++) {
     if (!rule_exists(i))
       continue;
@@ -872,7 +933,15 @@ void CrushWrapper::dump(Formatter *f) const
     f->close_section();
     f->close_section();
   }
-  f->close_section();
+}
+
+void CrushWrapper::list_rules(Formatter *f) const
+{
+  for (int rule = 0; rule < get_max_rules(); rule++) {
+    if (!rule_exists(rule))
+      continue;
+    f->dump_string("name", get_rule_name(rule));
+  }
 }
 
 void CrushWrapper::generate_test_instances(list<CrushWrapper*>& o)
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 7def6e4ab34..0b919cba3ec 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -221,12 +221,15 @@ public:
   }
 
   // rule names
-  int get_rule_id(const char *n) {
-    string name(n);
+  bool rule_exists(string name) {
+    build_rmaps();
+    return rule_name_rmap.count(name);
+  }
+  int get_rule_id(string name) {
     build_rmaps();
     if (rule_name_rmap.count(name))
       return rule_name_rmap[name];
-    return 0;  /* hrm */
+    return -ENOENT;
   }
   const char *get_rule_name(int t) const {
     std::map<int,string>::const_iterator p = rule_name_map.find(t);
@@ -527,6 +530,9 @@ public:
     return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0);
   }
 
+  int add_simple_rule(string name, string root_name, string failure_domain_type);
+
+  int remove_rule(int ruleno);
 
 
   /** buckets **/
@@ -735,6 +741,8 @@ public:
   void decode(bufferlist::iterator &blp);
   void decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator &blp);
   void dump(Formatter *f) const;
+  void dump_rules(Formatter *f) const;
+  void list_rules(Formatter *f) const;
   static void generate_test_instances(list<CrushWrapper*>& o);
 };
 WRITE_CLASS_ENCODER(CrushWrapper)
diff --git a/src/crush/crush.c b/src/crush/crush.c
index 19a765228e9..1e83eb866bb 100644
--- a/src/crush/crush.c
+++ b/src/crush/crush.c
@@ -116,7 +116,7 @@ void crush_destroy(struct crush_map *map)
 	if (map->rules) {
 		__u32 b;
 		for (b = 0; b < map->max_rules; b++)
-			kfree(map->rules[b]);
+			crush_destroy_rule(map->rules[b]);
 		kfree(map->rules);
 	}
 
@@ -124,6 +124,11 @@ void crush_destroy(struct crush_map *map)
 	kfree(map);
 }
 
+void crush_destroy_rule(struct crush_rule *rule)
+{
+	kfree(rule);
+}
+
 // methods to check for safe arithmetic operations
 int crush_addition_is_unsafe(__u32 a, __u32 b)
 {
diff --git a/src/crush/crush.h b/src/crush/crush.h
index 9fd37e9e516..82d032879d9 100644
--- a/src/crush/crush.h
+++ b/src/crush/crush.h
@@ -185,6 +185,7 @@ extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
 extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
 extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
 extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy_rule(struct crush_rule *r);
 extern void crush_destroy(struct crush_map *map);
 
 static inline int crush_calc_tree_node(int i)
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 507eed74c42..209bb982d86 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1880,6 +1880,38 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
 	}
       }
     }
+    else if (m->cmd[1] == "find") {
+      if (m->cmd.size() < 3) {
+	ss << "usage: osd find <osd-id>";
+	r = -EINVAL;
+	goto out;
+      }
+      long osd = parse_osd_id(m->cmd[2].c_str(), &ss);
+      if (osd < 0) {
+	r = -EINVAL;
+	goto out;
+      }
+      if (!osdmap.exists(osd)) {
+	ss << "osd." << osd << " does not exist";
+	r = -ENOENT;
+	goto out;
+      }
+      JSONFormatter jf(true);
+      jf.open_object_section("osd_location");
+      jf.dump_int("osd", osd);
+      jf.dump_stream("ip") << osdmap.get_addr(osd);
+      jf.open_object_section("crush_location");
+      map<string,string> loc = osdmap.crush->get_full_location(osd);
+      for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
+	jf.dump_string(p->first.c_str(), p->second);
+      jf.close_section();
+      jf.close_section();
+      ostringstream rs;
+      jf.flush(rs);
+      rs << "\n";
+      rdata.append(rs.str());
+      r = 0;
+    }
     else if (m->cmd[1] == "map" && m->cmd.size() == 4) {
       int64_t pool = osdmap.lookup_pg_pool_name(m->cmd[2].c_str());
       if (pool < 0) {
@@ -1964,6 +1996,40 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
       ss << "listed " << osdmap.blacklist.size() << " entries";
       r = 0;
     }
+    else if (m->cmd.size() >= 4 && m->cmd[1] == "crush" && m->cmd[2] == "rule" && (m->cmd[3] == "list" ||
+										   m->cmd[3] == "ls")) {
+      JSONFormatter jf(true);
+      jf.open_array_section("rules");
+      osdmap.crush->list_rules(&jf);
+      jf.close_section();
+      ostringstream rs;
+      jf.flush(rs);
+      rs << "\n";
+      rdata.append(rs.str());
+      r = 0;
+    }
+    else if (m->cmd.size() >= 4 && m->cmd[1] == "crush" && m->cmd[2] == "rule" && m->cmd[3] == "dump") {
+      JSONFormatter jf(true);
+      jf.open_array_section("rules");
+      osdmap.crush->dump_rules(&jf);
+      jf.close_section();
+      ostringstream rs;
+      jf.flush(rs);
+      rs << "\n";
+      rdata.append(rs.str());
+      r = 0;
+    }
+    else if (m->cmd.size() == 3 && m->cmd[1] == "crush" && m->cmd[2] == "dump") {
+      JSONFormatter jf(true);
+      jf.open_object_section("crush_map");
+      osdmap.crush->dump(&jf);
+      jf.close_section();
+      ostringstream rs;
+      jf.flush(rs);
+      rs << "\n";
+      rdata.append(rs.str());
+      r = 0;
+    }
   }
  out:
   if (r != -1) {
@@ -2380,6 +2446,94 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	return true;
       }
     }
+    else if (m->cmd.size() == 7 &&
+	     m->cmd[1] == "crush" &&
+	     m->cmd[2] == "rule" &&
+	     m->cmd[3] == "create-simple") {
+      string name = m->cmd[4];
+      string root = m->cmd[5];
+      string type = m->cmd[6];
+
+      if (osdmap.crush->rule_exists(name)) {
+	ss << "rule " << name << " already exists";
+	err = 0;
+	goto out;
+      }
+
+      bufferlist bl;
+      if (pending_inc.crush.length())
+	bl = pending_inc.crush;
+      else
+	osdmap.crush->encode(bl);
+      CrushWrapper newcrush;
+      bufferlist::iterator p = bl.begin();
+      newcrush.decode(p);
+
+      if (newcrush.rule_exists(name)) {
+	ss << "rule " << name << " already exists";
+      } else {
+	int rule = newcrush.add_simple_rule(name, root, type);
+	if (rule < 0) {
+	  err = rule;
+	  goto out;
+	}
+
+	pending_inc.crush.clear();
+	newcrush.encode(pending_inc.crush);
+      }
+      getline(ss, rs);
+      paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version()));
+      return true;
+    }
+    else if (m->cmd.size() == 5 &&
+	     m->cmd[1] == "crush" &&
+	     m->cmd[2] == "rule" &&
+	     m->cmd[3] == "rm") {
+      string name = m->cmd[4];
+
+      if (!osdmap.crush->rule_exists(name)) {
+	ss << "rule " << name << " does not exist";
+	err = 0;
+	goto out;
+      }
+
+      bufferlist bl;
+      if (pending_inc.crush.length())
+	bl = pending_inc.crush;
+      else
+	osdmap.crush->encode(bl);
+      CrushWrapper newcrush;
+      bufferlist::iterator p = bl.begin();
+      newcrush.decode(p);
+
+      if (!newcrush.rule_exists(name)) {
+	ss << "rule " << name << " does not exist";
+      } else {
+	int ruleno = newcrush.get_rule_id(name);
+	assert(ruleno >= 0);
+
+	// make sure it is not in use.
+	// FIXME: this is ok in some situations, but let's not bother with that
+	// complexity now.
+	int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
+	if (osdmap.crush_ruleset_in_use(ruleset)) {
+	  ss << "crush rule " << name << " ruleset " << ruleset << " is in use";
+	  err = -EBUSY;
+	  goto out;
+	}
+
+	err = newcrush.remove_rule(ruleno);
+	if (err < 0) {
+	  goto out;
+	}
+
+	pending_inc.crush.clear();
+	newcrush.encode(pending_inc.crush);
+      }
+      getline(ss, rs);
+      paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs, paxos->get_version()));
+      return true;
+    }
     else if (m->cmd[1] == "setmaxosd" && m->cmd.size() > 2) {
       int newmax = parse_pos_long(m->cmd[2].c_str(), &ss);
       if (newmax < 0) {
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index c7d044ac6fd..6b692d407a8 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1654,6 +1654,15 @@ void OSDMap::print_summary(ostream& out) const
     out << " nearfull";
 }
 
+bool OSDMap::crush_ruleset_in_use(int ruleset) const
+{
+  for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin(); p != pools.end(); ++p) {
+    if (p->second.crush_ruleset == ruleset)
+      return true;
+  }
+  return false;
+}
+
 void OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
 			  int nosd, int pg_bits, int pgp_bits)
 {
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index d161fa7436b..70ec263e4d8 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -553,6 +553,7 @@ public:
   static void build_simple_crush_map_from_conf(CephContext *cct, CrushWrapper& crush,
 					       map<int, const char*>& rulesets);
 
+  bool crush_ruleset_in_use(int ruleset) const;
 
 private:
   void print_osd_line(int cur, ostream *out, Formatter *f) const;
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 6526633b7d7..fc3f8c24b1f 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2649,7 +2649,8 @@ void PG::read_state(ObjectStore *store, bufferlist &bl)
   try {
     ostringstream oss;
     read_log(store, coll, log_oid, info, ondisklog, log, missing, oss, this);
-    osd->clog.error() << oss;
+    if (oss.str().length())
+      osd->clog.error() << oss;
   }
   catch (const buffer::error &e) {
     string cr_log_coll_name(get_corrupt_pg_log_name());
diff --git a/src/test/common/Throttle.cc b/src/test/common/Throttle.cc
new file mode 100644
index 00000000000..60d7daebdac
--- /dev/null
+++ b/src/test/common/Throttle.cc
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Library Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <signal.h>
+#include "common/Mutex.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include <gtest/gtest.h>
+
+class ThrottleTest : public ::testing::Test {
+protected:
+
+  class Thread_get : public Thread {
+  public:
+    Throttle &throttle;
+    int64_t count;
+    bool waited;
+
+    Thread_get(Throttle& _throttle, int64_t _count) : 
+      throttle(_throttle),
+      count(_count),
+      waited(false)
+    {
+    }
+    
+    virtual void *entry() {
+      waited = throttle.get(count);
+      throttle.put(count);
+      return NULL;
+    }
+  };
+
+};
+
+TEST_F(ThrottleTest, Throttle) {
+  ASSERT_THROW({
+      Throttle throttle(g_ceph_context, "throttle", -1);
+    }, FailedAssertion);
+
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  ASSERT_EQ(throttle.get_max(), throttle_max);
+  ASSERT_EQ(throttle.get_current(), 0);
+}
+
+TEST_F(ThrottleTest, take) {
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  ASSERT_THROW(throttle.take(-1), FailedAssertion);
+  ASSERT_EQ(throttle.take(throttle_max), throttle_max);
+  ASSERT_EQ(throttle.take(throttle_max), throttle_max * 2);
+}
+
+TEST_F(ThrottleTest, get) {
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+  ASSERT_THROW(throttle.get(-1), FailedAssertion);
+  ASSERT_FALSE(throttle.get(5)); 
+  ASSERT_EQ(throttle.put(5), 0); 
+
+  ASSERT_FALSE(throttle.get(throttle_max)); 
+  ASSERT_FALSE(throttle.get_or_fail(1)); 
+  ASSERT_FALSE(throttle.get(1, throttle_max + 1)); 
+  ASSERT_EQ(throttle.put(throttle_max + 1), 0); 
+  ASSERT_FALSE(throttle.get(0, throttle_max)); 
+  ASSERT_FALSE(throttle.get(throttle_max)); 
+  ASSERT_FALSE(throttle.get_or_fail(1)); 
+  ASSERT_EQ(throttle.put(throttle_max), 0); 
+
+  useconds_t delay = 1;
+
+  bool waited;
+
+  do {
+    cout << "Trying (1) with delay " << delay << "us\n";
+
+    ASSERT_FALSE(throttle.get(throttle_max)); 
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max));  
+
+    Thread_get t(throttle, 7);
+    t.create();
+    usleep(delay);
+    ASSERT_EQ(throttle.put(throttle_max), 0); 
+    t.join();
+
+    if (!(waited = t.waited))
+      delay *= 2;
+  } while(!waited);
+	  
+  do {
+    cout << "Trying (2) with delay " << delay << "us\n";
+
+    ASSERT_FALSE(throttle.get(throttle_max / 2));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max));  
+
+    Thread_get t(throttle, throttle_max);
+    t.create();
+    usleep(delay);
+
+    Thread_get u(throttle, 1);
+    u.create();
+    usleep(delay);
+
+    throttle.put(throttle_max / 2);
+
+    t.join();
+    u.join();
+
+    if (!(waited = t.waited && u.waited))
+      delay *= 2;
+  } while(!waited);
+	  
+}
+
+TEST_F(ThrottleTest, get_or_fail) {
+  {
+    Throttle throttle(g_ceph_context, "throttle");
+    
+    ASSERT_TRUE(throttle.get_or_fail(5));
+    ASSERT_TRUE(throttle.get_or_fail(5));
+  }
+
+  {
+    int64_t throttle_max = 10;
+    Throttle throttle(g_ceph_context, "throttle", throttle_max);
+
+    ASSERT_TRUE(throttle.get_or_fail(throttle_max));
+    ASSERT_EQ(throttle.put(throttle_max), 0);
+
+    ASSERT_TRUE(throttle.get_or_fail(throttle_max * 2));
+    ASSERT_FALSE(throttle.get_or_fail(1));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max * 2));
+    ASSERT_EQ(throttle.put(throttle_max * 2), 0);
+
+    ASSERT_TRUE(throttle.get_or_fail(throttle_max));  
+    ASSERT_FALSE(throttle.get_or_fail(1));  
+    ASSERT_EQ(throttle.put(throttle_max), 0);
+  }
+}
+
+TEST_F(ThrottleTest, wait) {
+  int64_t throttle_max = 10;
+  Throttle throttle(g_ceph_context, "throttle", throttle_max);
+
+  useconds_t delay = 1;
+
+  bool waited;
+
+  do {
+    cout << "Trying (3) with delay " << delay << "us\n";
+
+    ASSERT_FALSE(throttle.get(throttle_max / 2));
+    ASSERT_FALSE(throttle.get_or_fail(throttle_max));  
+
+    Thread_get t(throttle, throttle_max);
+    t.create();
+    usleep(delay);
+
+    //
+    // Throttle::_reset_max(int64_t m) used to contain a test
+    // that blocked the following statement, only if 
+    // the argument was greater than throttle_max. 
+    // Although a value lower than throttle_max would cover
+    // the same code in _reset_max, the throttle_max * 100
+    // value is left here to demonstrate that the problem
+    // has been solved.
+    //
+    throttle.wait(throttle_max * 100);
+    usleep(delay);
+    ASSERT_EQ(throttle.get_current(), throttle_max / 2);
+
+
+    t.join();
+
+    if (!(waited = t.waited)) {
+      delay *= 2;
+      // undo the changes we made
+      throttle.put(throttle_max / 2);
+      throttle.wait(throttle_max);
+    }
+  } while(!waited);
+}
+
+TEST_F(ThrottleTest, destructor) {
+  Thread_get *t;
+  {
+    int64_t throttle_max = 10;
+    Throttle *throttle = new Throttle(g_ceph_context, "throttle", throttle_max);
+
+    ASSERT_FALSE(throttle->get(5)); 
+
+    t = new Thread_get(*throttle, 7);
+    t->create();
+    bool blocked;
+    useconds_t delay = 1;
+    do {
+      usleep(delay);
+      if (throttle->get_or_fail(1)) {
+	throttle->put(1);
+	blocked = false;
+      } else {
+	blocked = true;
+      }
+      delay *= 2;
+    } while(!blocked);
+    delete throttle;
+  }
+  
+  { // 
+    // The thread is left hanging, otherwise it will abort().
+    // Deleting the Throttle on which it is waiting creates a
+    // inconsistency that will be detected: the Throttle object that
+    // it references no longer exists.
+    //
+    pthread_t id = t->get_thread_id();
+    ASSERT_EQ(pthread_kill(id, 0), 0);
+    delete t;
+    ASSERT_EQ(pthread_kill(id, 0), 0);
+  }
+}
+
+int main(int argc, char **argv) {
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+// Local Variables:
+// compile-command: "cd ../.. ; make unittest_throttle ; ./unittest_throttle # --gtest_filter=ThrottleTest.destructor --log-to-stderr=true --debug-filestore=20"
+// End: