Stroika Library 3.0d23
 
Loading...
Searching...
No Matches
BloomFilter.h
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2026. All rights reserved
3 */
4#ifndef _Stroika_Foundation_Cache_BloomFilter_h_
5#define _Stroika_Foundation_Cache_BloomFilter_h_ 1
6
7#include "Stroika/Foundation/StroikaPreComp.h"
8
9#include <type_traits>
10
12#include "Stroika/Foundation/Common/Common.h"
14#include "Stroika/Foundation/Containers/Sequence.h"
15#include "Stroika/Foundation/Containers/Set.h"
16#include "Stroika/Foundation/Cryptography/Digest/Hash.h"
17
18/**
19 * \note Code-Status: <a href="Code-Status.md#Beta">Beta</a>
20 */
21
23
24 /**
25 * \brief a Bloom filter is a probablistic set, which returns either "probably in set" or "definitely not in set"
26 *
27 * Description:
28 * http://en.wikipedia.org/wiki/Bloom_filter
29 * fHashFunction.size () corresponds to 'k'
30 * fBitCount corresponds to 'm'
31 *
32 * \note - when the bloom filter starts getting full, it becomes less useful. A good way to decide
33 * when its time to recreate a larger bloom filter is with:
34 * \code
35 * runToProbOfFalsePositive = 0.5; // ?? depends on your tolerance and cost of resizing bloom filter
36 * if (addressesProbablyUsed.GetStatistics ().ProbabilityOfFalsePositive () >= runToProbOfFalsePositive) {
37 * ... resize bloom filter - means losing all existing data...
38 * \endcode
39 *
40 * \note BloomFilter does NOT support ICache<> API, due to not having a Lookup () function - because it
41 * can generate false positives. MAY want to reconsider this stand???? --LGP 2026-03-27
42 */
43 template <typename T>
45 public:
46 using HashResultType = uint64_t; // often something smaller will be used, but best to pick a wider type here, and @todo maybe make it a template parameter
47
48 public:
49 /**
50 * \note uses std::function instead of directly storing function-object (losing efficiency) because must maintain
51 * an array of these, and thats trickier if we allow them to have different types.
52 */
53 using HashFunctionType = function<HashResultType (T)>;
54
55 public:
56 /**
57 * \brief kDefaultDesiredFalsePositivityRate is used for constructors given expectedMaxSetSize to compute the appropriate #bits for the BloomFilter or for explicit calls to OptimizeBitSize()
58 */
59 static inline constexpr double kDefaultDesiredFalsePositivityRate = 0.1;
60
61 public:
62 /**
63 * The constructor with an explicit set of hash functions and bitCount uses those hash functions and bitCount.
64 * The constructor taking expectedMaxSetSize calls OptimizeBitSize and OptimizeNumberOfHashFunctions to calculate
65 * optimimum numbers of hash functions and bitstorage for the given expected set size. It also uses DeriveIndependentHashFunctions ()
66 * to automatically create (semi) independent hash functions from the original argument one (this maybe could use tuning/work).
67 *
68 * \note - for the hash functions, you can simple use std::hash<T>, but the hash implementation on gcc (libstdc++) works quite badly
69 * as a crytographic hash for integer types. So the default is to use the Stroika hash function.
70 * You can directly use Cryptography::Digest::Hash<> instead (the default), and this will generaly work better where its
71 * defined.
72 */
73 BloomFilter (const Containers::Sequence<HashFunctionType>& hashFunctions, size_t bitCount);
74 BloomFilter (size_t expectedMaxSetSize, const HashFunctionType& defaultHashFunction = Cryptography::Digest::Hash<T>{},
75 double desiredFalsePositivityRate = kDefaultDesiredFalsePositivityRate);
76 BloomFilter (BloomFilter&& src) noexcept = default;
77 BloomFilter (const BloomFilter& src) = default;
78
79 public:
80 nonvirtual BloomFilter& operator= (BloomFilter&& src) noexcept = default;
81 nonvirtual BloomFilter& operator= (const BloomFilter& src) = default;
82
83 public:
84 static unsigned int OptimizeBitSize (size_t nElements, double desiredFalsePositiveProbability = kDefaultDesiredFalsePositivityRate);
85
86 public:
87 static unsigned int OptimizeNumberOfHashFunctions (size_t setSize, size_t bitSize);
88
89 public:
90 /**
91 * From http://en.wikipedia.org/wiki/Bloom_filter:
92 * (1-exp(-kn/m))^^k
93 *
94 * n elements inserted so far (other letter/vars see struct docs above)
95 *
96 * This formula is not good - see https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=903775#:~:text=The%20probability%20of%20a%20false,mapped%20into%20the%20Bloom%20filter).
97 * and its especially not good because it should mostly depend on number of bits set, not nElementsInsertedSoFar.
98 * IF/WHEN I can fix that, maybe make this a method of the 'Statistics' object.
99 *
100 * See comments in GetFractionFull() about alternative better possible approach.
101 */
102 static double ProbabilityOfFalsePositive (size_t hashFunctionCount, size_t bitCount, size_t nElementsInsertedSoFar);
103 nonvirtual double ProbabilityOfFalsePositive (size_t nElementsInsertedSoFar) const;
104
105 public:
107
108 public:
109 /**
110 * return true if this was probably a change, and false if it was probably already present
111 */
112 nonvirtual bool Add (Common::ArgByValueType<T> elt);
113
114 public:
115 /**
116 */
117 nonvirtual void clear ();
118
119 public:
120 /**
121 * \brief returns true if this (probably) Contains 'elt' - and false otherwise.
122 *
123 * False positive retrieval results are possible, but false negatives are not;
124 *
125 * Call GetEffectiveOptions ().ProbabilityOfFalsePositive (nEltsAdded) to an estimate of the probability of false positives.
126 */
127 nonvirtual bool ProbablyContains (Common::ArgByValueType<T> elt) const;
128
129 [[deprecated ("Since Stroika v3.0d10 use ProbablyContains")]] bool Contains (Common::ArgByValueType<T> elt) const
130 {
131 return ProbablyContains (elt);
132 }
133
134 public:
135 struct Statistics;
136
137 public:
138 /**
139 */
140 nonvirtual Statistics GetStatistics () const;
141
142 private:
143 vector<HashFunctionType> fHashFunctions_;
144 vector<bool> fBits_; // @todo use Set_Bitstring
145 unsigned int fActualAddCalls_{}; // to support useful statistics, not strictly needed
146 unsigned int fApparentlyDistinctAddCalls_{}; // ""
147 };
148
149 /**
150 */
151 template <typename T>
152 struct BloomFilter<T>::Statistics {
153 size_t fHashFunctions;
154 size_t fBitCount;
155 size_t fBitsSet;
156 unsigned int fActualAddCalls;
157 unsigned int fApparentlyDistinctAddCalls;
158
159 public:
160 /**
161 * \brief return number from 0..1 indicating 'percent' of the bloom filter full so its clear when to refresh/resize it (or at least when its going to start getting lots of false positives)
162 *
163 * @todo see comments on ProbabilityOfFalsePositive () - should be able to compute ProbabilityOfFalsePositive based just on this info
164 * (roughly pct bits unset / number of hash functions). But I cannot find a reference for that so leaving this as is for now.
165 */
166 nonvirtual double GetFractionFull () const;
167
168 public:
169 /**
170 */
171 nonvirtual double ProbabilityOfFalsePositive () const;
172
173 public:
174 /**
175 * @see Characters::ToString ()
176 */
177 nonvirtual Characters::String ToString () const;
178 };
179
180}
181
182/*
183 ********************************************************************************
184 ***************************** Implementation Details ***************************
185 ********************************************************************************
186 */
187#include "BloomFilter.inl"
188
189#endif /*_Stroika_Foundation_Cache_BloomFilter_h_ */
a Bloom filter is a probablistic set, which returns either "probably in set" or "definitely not in se...
Definition BloomFilter.h:44
static constexpr double kDefaultDesiredFalsePositivityRate
kDefaultDesiredFalsePositivityRate is used for constructors given expectedMaxSetSize to compute the a...
Definition BloomFilter.h:59
nonvirtual bool ProbablyContains(Common::ArgByValueType< T > elt) const
returns true if this (probably) Contains 'elt' - and false otherwise.
nonvirtual bool Add(Common::ArgByValueType< T > elt)
static double ProbabilityOfFalsePositive(size_t hashFunctionCount, size_t bitCount, size_t nElementsInsertedSoFar)
static Containers::Sequence< HashFunctionType > DeriveIndependentHashFunctions(const HashFunctionType &h, size_t repeatCount)
function< HashResultType(T)> HashFunctionType
Definition BloomFilter.h:53
A generalization of a vector: a container whose elements are keyed by the natural numbers.
STRING_TYPE ToString(FLOAT_TYPE f, const ToStringOptions &options={})
conditional_t<(sizeof(CHECK_T)<=2 *sizeof(void *)) and is_trivially_copyable_v< CHECK_T >, CHECK_T, const CHECK_T & > ArgByValueType
This is an alias for 'T' - but how we want to pass it on stack as formal parameter.
Definition TypeHints.h:36
Simple wrapper on (cryptographic) digest algorithms with fewer knobs, and easier construcion- mimicin...
Definition Hash.h:155