diff options
-rw-r--r-- | src/common/bloom_filter.hpp | 25 | ||||
-rw-r--r-- | src/test/common/test_bloom_filter.cc | 5 |
2 files changed, 28 insertions, 2 deletions
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index c336eef8b6a..4b711526676 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -296,6 +296,31 @@ public: return insert_count_; } + /* + * density of bits set. inconvenient units, but: + * .3 = ~50% target insertions + * .5 = 100% target insertions, "perfectly full" + * .75 = 200% target insertions + * 1.0 = all bits set... infinite insertions + */ + inline double density() const + { + size_t set = 0; + uint8_t *p = bit_table_; + size_t left = table_size_; + while (left-- > 0) { + uint8_t c = *p; + for (; c; ++set) + c &= c - 1; + ++p; + } + return (double)set / (double)(table_size_ << 3); + } + + inline double approx_unique_element_count() const { + // this is not a very good estimate; a better solution should have + // some asymptotic behavior as density() approaches 1.0. + return (double)target_element_count_ * 2.0 * density(); } inline double effective_fpp() const diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index 8e3661b2cc1..5edf3f0f9fa 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -62,7 +62,7 @@ TEST(BloomFilter, Sweep) { } TEST(BloomFilter, SweepInt) { - std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl; for (int ex = 3; ex < 12; ex += 2) { for (float fpp = .001; fpp < .5; fpp *= 4.0) { int max = 2 << ex; @@ -92,7 +92,8 @@ TEST(BloomFilter, SweepInt) { double byte_per_insert = (double)bl.length() / (double)max; - std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert + << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl; ASSERT_TRUE(actual < fpp * 10); ASSERT_TRUE(actual > fpp / 10); } |