Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
BloomFilter.h
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#ifndef _Stroika_Foundation_Cache_BloomFilter_h_
5#define _Stroika_Foundation_Cache_BloomFilter_h_ 1
6
7#include "Stroika/Foundation/StroikaPreComp.h"
8
9#include <type_traits>
10
12#include "Stroika/Foundation/Common/Common.h"
14#include "Stroika/Foundation/Containers/Sequence.h"
15#include "Stroika/Foundation/Containers/Set.h"
16#include "Stroika/Foundation/Cryptography/Digest/Hash.h"
17
18/**
19 * \note Code-Status: <a href="Code-Status.md#Beta">Beta</a>
20 */
21
23
24 /**
25 * \brief a Bloom filter is a probablistic set, which returns either "probably in set" or "definitely not in set"
26 *
27 * Description:
28 * http://en.wikipedia.org/wiki/Bloom_filter
29 * fHashFunction.size () corresponds to 'k'
30 * fBitCount corresponds to 'm'
31 *
32 * \note - when the bloom filter starts getting full, it becomes less useful. A good way to decide
33 * when its time to recreate a larger bloom filter is with:
34 * \code
35 * runToProbOfFalsePositive = 0.5; // ?? depends on your tolerance and cost of resizing bloom filter
36 * if (addressesProbablyUsed.GetStatistics ().ProbabilityOfFalsePositive () >= runToProbOfFalsePositive) {
37 * ... resize bloom filter - means losing all existing data...
38 * \endcode
39 *
40 */
41 template <typename T>
43 public:
44 using HashResultType = uint64_t; // often something smaller will be used, but best to pick a wider type here, and @todo maybe make it a template parameter
45
46 public:
47 /**
48 * \note uses std::function instead of directly storing function-object (losing efficiency) because must maintain
49 * an array of these, and thats trickier if we allow them to have different types.
50 */
51 using HashFunctionType = function<HashResultType (T)>;
52
53 public:
54 /**
55 * \brief kDefaultDesiredFalsePositivityRate is used for constructors given expectedMaxSetSize to compute the appropriate #bits for the BloomFilter or for explicit calls to OptimizeBitSize()
56 */
57 static inline constexpr double kDefaultDesiredFalsePositivityRate = 0.1;
58
59 public:
60 /**
61 * The constructor with an explicit set of hash functions and bitCount uses those hash functions and bitCount.
62 * The constructor taking expectedMaxSetSize calls OptimizeBitSize and OptimizeNumberOfHashFunctions to calculate
63 * optimimum numbers of hash functions and bitstorage for the given expected set size. It also uses DeriveIndependentHashFunctions ()
64 * to automatically create (semi) independent hash functions from the original argument one (this maybe could use tuning/work).
65 *
66 * \note - for the hash functions, you can simple use std::hash<T>, but the hash implementation on gcc (libstdc++) works quite badly
67 * as a crytographic hash for integer types. So the default is to use the Stroika hash function.
68 * You can directly use Cryptography::Digest::Hash<> instead (the default), and this will generaly work better where its
69 * defined.
70 */
71 BloomFilter (const Containers::Sequence<HashFunctionType>& hashFunctions, size_t bitCount);
72 BloomFilter (size_t expectedMaxSetSize, const HashFunctionType& defaultHashFunction = Cryptography::Digest::Hash<T>{},
73 double desiredFalsePositivityRate = kDefaultDesiredFalsePositivityRate);
74 BloomFilter (BloomFilter&& src) noexcept = default;
75 BloomFilter (const BloomFilter& src) = default;
76
77 public:
78 nonvirtual BloomFilter& operator= (BloomFilter&& src) noexcept = default;
79 nonvirtual BloomFilter& operator= (const BloomFilter& src) = default;
80
81 public:
82 static unsigned int OptimizeBitSize (size_t nElements, double desiredFalsePositiveProbability = kDefaultDesiredFalsePositivityRate);
83
84 public:
85 static unsigned int OptimizeNumberOfHashFunctions (size_t setSize, size_t bitSize);
86
87 public:
88 /**
89 * From http://en.wikipedia.org/wiki/Bloom_filter:
90 * (1-exp(-kn/m))^^k
91 *
92 * n elements inserted so far (other letter/vars see struct docs above)
93 *
94 * This formula is not good - see https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=903775#:~:text=The%20probability%20of%20a%20false,mapped%20into%20the%20Bloom%20filter).
95 * and its especially not good because it should mostly depend on number of bits set, not nElementsInsertedSoFar.
96 * IF/WHEN I can fix that, maybe make this a method of the 'Statistics' object.
97 *
98 * See comments in GetFractionFull() about alternative better possible approach.
99 */
101 nonvirtual double ProbabilityOfFalsePositive (size_t nElementsInsertedSoFar) const;
102
103 public:
105
106 public:
107 /**
108 * return true if this was probably a change, and false if it was probably already present
109 */
110 nonvirtual bool Add (Common::ArgByValueType<T> elt);
111
112 public:
113 /**
114 */
115 nonvirtual void clear ();
116
117 public:
118 /**
119 * \brief returns true if this (probably) Contains 'elt' - and false otherwise.
120 *
121 * False positive retrieval results are possible, but false negatives are not;
122 *
123 * Call GetEffectiveOptions ().ProbabilityOfFalsePositive (nEltsAdded) to an estimate of the probability of false positives.
124 */
125 nonvirtual bool ProbablyContains (Common::ArgByValueType<T> elt) const;
126
127 [[deprecated ("Since Stroika v3.0d10 use ProbablyContains")]] bool Contains (Common::ArgByValueType<T> elt) const
128 {
129 return ProbablyContains (elt);
130 }
131
132 public:
133 struct Statistics;
134
135 public:
136 /**
137 */
138 nonvirtual Statistics GetStatistics () const;
139
140 private:
141 vector<HashFunctionType> fHashFunctions_;
142 vector<bool> fBits_; // @todo use Set_Bitstring
143 unsigned int fActualAddCalls_{}; // to support useful statistics, not strictly needed
144 unsigned int fApparentlyDistinctAddCalls_{}; // ""
145 };
146
147 /**
148 */
149 template <typename T>
150 struct BloomFilter<T>::Statistics {
151 size_t fHashFunctions;
152 size_t fBitCount;
153 size_t fBitsSet;
154 unsigned int fActualAddCalls;
155 unsigned int fApparentlyDistinctAddCalls;
156
157 public:
158 /**
159 * \brief return number from 0..1 indicating 'percent' of the bloom filter full so its clear when to refresh/resize it (or at least when its going to start getting lots of false positives)
160 *
161 * @todo see comments on ProbabilityOfFalsePositive () - should be able to compute ProbabilityOfFalsePositive based just on this info
162 * (roughly pct bits unset / number of hash functions). But I cannot find a reference for that so leaving this as is for now.
163 */
164 nonvirtual double GetFractionFull () const;
165
166 public:
167 /**
168 */
169 nonvirtual double ProbabilityOfFalsePositive () const;
170
171 public:
172 /**
173 * @see Characters::ToString ()
174 */
175 nonvirtual Characters::String ToString () const;
176 };
177
178}
179
180/*
181 ********************************************************************************
182 ***************************** Implementation Details ***************************
183 ********************************************************************************
184 */
185#include "BloomFilter.inl"
186
187#endif /*_Stroika_Foundation_Cache_BloomFilter_h_ */
a Bloom filter is a probablistic set, which returns either "probably in set" or "definitely not in se...
Definition BloomFilter.h:42
static constexpr double kDefaultDesiredFalsePositivityRate
kDefaultDesiredFalsePositivityRate is used for constructors given expectedMaxSetSize to compute the a...
Definition BloomFilter.h:57
nonvirtual bool ProbablyContains(Common::ArgByValueType< T > elt) const
returns true if this (probably) Contains 'elt' - and false otherwise.
nonvirtual bool Add(Common::ArgByValueType< T > elt)
static double ProbabilityOfFalsePositive(size_t hashFunctionCount, size_t bitCount, size_t nElementsInsertedSoFar)
static Containers::Sequence< HashFunctionType > DeriveIndependentHashFunctions(const HashFunctionType &h, size_t repeatCount)
function< HashResultType(T)> HashFunctionType
Definition BloomFilter.h:51
LRUCache implements a simple least-recently-used caching strategy, with optional hashing (of keys) to...
Definition LRUCache.h:94
A generalization of a vector: a container whose elements are keyed by the natural numbers.
Definition Sequence.h:187
STRING_TYPE ToString(FLOAT_TYPE f, const ToStringOptions &options={})
conditional_t<(sizeof(CHECK_T)<=2 *sizeof(void *)) and is_trivially_copyable_v< CHECK_T >, CHECK_T, const CHECK_T & > ArgByValueType
This is an alias for 'T' - but how we want to pass it on stack as formal parameter.
Definition TypeHints.h:32
Simple wrapper on (cryptographic) digest algorithms with fewer knobs, and easier construcion- mimicin...
Definition Hash.h:155