8#include "Stroika/Foundation/Execution/Exceptions.h"
9#include "Stroika/Foundation/Math/Common.h"
19 inline double BloomFilter<T>::Statistics::GetFractionFull ()
const
21 return static_cast<double> (fBitsSet) /
static_cast<double> (fBitCount);
24 inline double BloomFilter<T>::Statistics::ProbabilityOfFalsePositive ()
const
29 Characters::String BloomFilter<T>::Statistics::ToString ()
const
31 Characters::StringBuilder sb;
33 sb <<
"HashFunctions: "sv << fHashFunctions;
34 sb <<
", BitCount: "sv << fBitCount;
35 sb <<
", BitsSet: "sv << fBitsSet;
36 sb <<
", ActualAddCalls: "sv << fActualAddCalls;
37 sb <<
", ApparentlyDistinctAddCalls: "sv << fApparentlyDistinctAddCalls;
38 sb <<
", ProbabilityOfFalsePositive: "sv << ProbabilityOfFalsePositive ();
52 Require (fHashFunctions_.size () >= 1);
53 Require (bitCount >= 1);
54 fBits_.resize (bitCount,
false);
57 BloomFilter<T>::BloomFilter (
size_t expectedMaxSetSize,
const HashFunctionType& defaultHashFunction,
double desiredFalsePositivityRate)
59 size_t bitCount = OptimizeBitSize (expectedMaxSetSize, desiredFalsePositivityRate);
60 size_t hashFunctionCnt = OptimizeNumberOfHashFunctions (expectedMaxSetSize, bitCount);
61 fBits_.resize (bitCount,
false);
62 fHashFunctions_ = DeriveIndependentHashFunctions (defaultHashFunction, hashFunctionCnt).template As<vector<HashFunctionType>> ();
65 inline unsigned int BloomFilter<T>::OptimizeBitSize (
size_t nElements,
double desiredFalsePositiveProbability)
68 return static_cast<unsigned int> (::ceil (-
static_cast<double> (nElements) * log (desiredFalsePositiveProbability) / log (2) * log (2)));
71 inline unsigned int BloomFilter<T>::OptimizeNumberOfHashFunctions (
size_t setSize,
size_t bitSize)
74 return static_cast<unsigned int> (::ceil ((
double (bitSize) / setSize) * log (2)));
80 return ::pow (1 - ::exp (-
static_cast<double> (hashFunctionCount * nElementsInsertedSoFar) / bitCount), hashFunctionCount);
85 return ProbabilityOfFalsePositive (fHashFunctions_.size (), fBits_.size (), nElementsInsertedSoFar);
90 Require (repeatCount >= 1);
106 for (
size_t i = 1; i < repeatCount; ++i) {
108 result += [=] (
const T& t) {
return Cryptography::Digest::HashValueCombine (h (t), seed); };
112 template <
typename T>
115 size_t sz{fBits_.size ()};
116 bool apparentlyNewAddition =
false;
118 size_t idx =
static_cast<size_t> (f (elt) % sz);
119 if (not fBits_[idx]) {
120 apparentlyNewAddition =
true;
125 if (apparentlyNewAddition) {
126 ++fApparentlyDistinctAddCalls_;
128 return apparentlyNewAddition;
130 template <
typename T>
135 template <
typename T>
138 size_t sz{fBits_.size ()};
140 if (not fBits_[f (elt) % sz]) {
146 template <
typename T>
150 for (
bool i : fBits_) {
155 return Statistics{fHashFunctions_.size (), fBits_.size (), nTrue, fActualAddCalls_, fApparentlyDistinctAddCalls_};
a Bloom filter is a probablistic set, which returns either "probably in set" or "definitely not in se...
BloomFilter(const Containers::Sequence< HashFunctionType > &hashFunctions, size_t bitCount)
nonvirtual bool ProbablyContains(Common::ArgByValueType< T > elt) const
returns true if this (probably) Contains 'elt' - and false otherwise.
nonvirtual bool Add(Common::ArgByValueType< T > elt)
static double ProbabilityOfFalsePositive(size_t hashFunctionCount, size_t bitCount, size_t nElementsInsertedSoFar)
static Containers::Sequence< HashFunctionType > DeriveIndependentHashFunctions(const HashFunctionType &h, size_t repeatCount)
function< HashResultType(T)> HashFunctionType
A generalization of a vector: a container whose elements are keyed by the natural numbers.
conditional_t<(sizeof(CHECK_T)<=2 *sizeof(void *)) and is_trivially_copyable_v< CHECK_T >, CHECK_T, const CHECK_T & > ArgByValueType
This is an alias for 'T' - but how we want to pass it on stack as formal parameter.
Simple wrapper on (cryptographic) digest algorithms with fewer knobs, and easier construcion- mimicin...