Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
BloomFilter.inl
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include <cmath>
5
8#include "Stroika/Foundation/Execution/Exceptions.h"
9#include "Stroika/Foundation/Math/Common.h"
10
12
13 /*
14 ********************************************************************************
15 ********************** Cache::BloomFilter<T>::Statistics ***********************
16 ********************************************************************************
17 */
18 template <typename T>
19 inline double BloomFilter<T>::Statistics::GetFractionFull () const
20 {
21 return static_cast<double> (fBitsSet) / static_cast<double> (fBitCount);
22 }
23 template <typename T>
24 inline double BloomFilter<T>::Statistics::ProbabilityOfFalsePositive () const
25 {
26 return BloomFilter<T>::ProbabilityOfFalsePositive (fHashFunctions, fBitCount, fApparentlyDistinctAddCalls);
27 }
28 template <typename T>
29 Characters::String BloomFilter<T>::Statistics::ToString () const
30 {
31 Characters::StringBuilder sb;
32 sb << "{"sv;
33 sb << "HashFunctions: "sv << fHashFunctions;
34 sb << ", BitCount: "sv << fBitCount;
35 sb << ", BitsSet: "sv << fBitsSet;
36 sb << ", ActualAddCalls: "sv << fActualAddCalls;
37 sb << ", ApparentlyDistinctAddCalls: "sv << fApparentlyDistinctAddCalls;
38 sb << ", ProbabilityOfFalsePositive: "sv << ProbabilityOfFalsePositive ();
39 sb << "}"sv;
40 return sb;
41 }
42
43 /*
44 ********************************************************************************
45 ****************************** Cache::BloomFilter ******************************
46 ********************************************************************************
47 */
48 template <typename T>
49 inline BloomFilter<T>::BloomFilter (const Containers::Sequence<HashFunctionType>& hashFunctions, size_t bitCount)
50 : fHashFunctions_{hashFunctions.template As<vector<HashFunctionType>> ()}
51 {
52 Require (fHashFunctions_.size () >= 1);
53 Require (bitCount >= 1);
54 fBits_.resize (bitCount, false);
55 }
56 template <typename T>
58 {
59 size_t bitCount = OptimizeBitSize (expectedMaxSetSize, desiredFalsePositivityRate);
60 size_t hashFunctionCnt = OptimizeNumberOfHashFunctions (expectedMaxSetSize, bitCount);
61 fBits_.resize (bitCount, false);
62 fHashFunctions_ = DeriveIndependentHashFunctions (defaultHashFunction, hashFunctionCnt).template As<vector<HashFunctionType>> ();
63 }
64 template <typename T>
65 inline unsigned int BloomFilter<T>::OptimizeBitSize (size_t nElements, double desiredFalsePositiveProbability)
66 {
67 // based on https://en.wikipedia.org/wiki/Bloom_filter (approximate)
68 return static_cast<unsigned int> (::ceil (-static_cast<double> (nElements) * log (desiredFalsePositiveProbability) / log (2) * log (2)));
69 }
70 template <typename T>
71 inline unsigned int BloomFilter<T>::OptimizeNumberOfHashFunctions (size_t setSize, size_t bitSize)
72 {
73 // based on https://en.wikipedia.org/wiki/Bloom_filter - (m/n)*ln(2)
74 return static_cast<unsigned int> (::ceil ((double (bitSize) / setSize) * log (2)));
75 }
76 template <typename T>
78 {
79 // From https://en.wikipedia.org/wiki/Bloom_filter
80 return ::pow (1 - ::exp (-static_cast<double> (hashFunctionCount * nElementsInsertedSoFar) / bitCount), hashFunctionCount);
81 }
82 template <typename T>
84 {
85 return ProbabilityOfFalsePositive (fHashFunctions_.size (), fBits_.size (), nElementsInsertedSoFar);
86 }
87 template <typename T>
89 {
90 Require (repeatCount >= 1);
91 /**
92 * From https://en.wikipedia.org/wiki/Bloom_filter
93 *
94 * The requirement of designing k different independent hash functions can be prohibitive
95 * for large k. For a good hash function with a wide output, there should be little if any
96 * correlation between different bit-fields of such a hash, so this type of hash can be
97 * used to generate multiple "different" hash functions by slicing its output into multiple
98 * bit fields. Alternatively, one can pass k different initial values (such as 0, 1, ..., k − 1)
99 * to a hash function that takes an initial value; or add (or append) these values to the key
100 *
101 * This trick here - is something similar to the suggestions in the wiki article - deriving
102 * a hash function by combining a unique seed (by hashing a constant) and combining that with the
103 * has result of the original function.
104 */
106 for (size_t i = 1; i < repeatCount; ++i) {
108 result += [=] (const T& t) { return Cryptography::Digest::HashValueCombine (h (t), seed); };
109 }
110 return result;
111 }
112 template <typename T>
114 {
115 size_t sz{fBits_.size ()};
116 bool apparentlyNewAddition = false;
117 for (const HashFunctionType& f : fHashFunctions_) {
118 size_t idx = static_cast<size_t> (f (elt) % sz);
119 if (not fBits_[idx]) {
121 }
122 fBits_[idx] = true;
123 }
124 ++fActualAddCalls_;
126 ++fApparentlyDistinctAddCalls_;
127 }
129 }
130 template <typename T>
131 inline void BloomFilter<T>::clear ()
132 {
133 fBits_.clear ();
134 }
135 template <typename T>
137 {
138 size_t sz{fBits_.size ()};
139 for (const HashFunctionType& f : fHashFunctions_) {
140 if (not fBits_[f (elt) % sz]) {
141 return false;
142 }
143 }
144 return true;
145 }
146 template <typename T>
147 auto BloomFilter<T>::GetStatistics () const -> Statistics
148 {
149 size_t nTrue{};
150 for (bool i : fBits_) {
151 if (i) {
152 ++nTrue;
153 }
154 }
155 return Statistics{fHashFunctions_.size (), fBits_.size (), nTrue, fActualAddCalls_, fApparentlyDistinctAddCalls_};
156 }
157
158}
a Bloom filter is a probablistic set, which returns either "probably in set" or "definitely not in se...
Definition BloomFilter.h:42
BloomFilter(const Containers::Sequence< HashFunctionType > &hashFunctions, size_t bitCount)
nonvirtual bool ProbablyContains(Common::ArgByValueType< T > elt) const
returns true if this (probably) Contains 'elt' - and false otherwise.
nonvirtual bool Add(Common::ArgByValueType< T > elt)
static double ProbabilityOfFalsePositive(size_t hashFunctionCount, size_t bitCount, size_t nElementsInsertedSoFar)
static Containers::Sequence< HashFunctionType > DeriveIndependentHashFunctions(const HashFunctionType &h, size_t repeatCount)
function< HashResultType(T)> HashFunctionType
Definition BloomFilter.h:51
LRUCache implements a simple least-recently-used caching strategy, with optional hashing (of keys) to...
Definition LRUCache.h:94
A generalization of a vector: a container whose elements are keyed by the natural numbers.
Definition Sequence.h:187
conditional_t<(sizeof(CHECK_T)<=2 *sizeof(void *)) and is_trivially_copyable_v< CHECK_T >, CHECK_T, const CHECK_T & > ArgByValueType
This is an alias for 'T' - but how we want to pass it on stack as formal parameter.
Definition TypeHints.h:32
Simple wrapper on (cryptographic) digest algorithms with fewer knobs, and easier construcion- mimicin...
Definition Hash.h:155