SeqAn3 3.4.0-rc.4
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages Concepts
bloom_filter.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
13
14namespace seqan3
15{
16
80template <data_layout data_layout_mode_ = data_layout::uncompressed>
82{
83private:
85 template <data_layout data_layout_mode>
86 friend class bloom_filter;
88
90 using data_type = std::conditional_t<data_layout_mode_ == data_layout::uncompressed,
91 seqan3::contrib::sdsl::bit_vector,
92 seqan3::contrib::sdsl::sd_vector<>>;
93
95 size_t size_in_bits{};
97 size_t hash_shift{};
99 size_t hash_funs{};
101 data_type data{};
103 static constexpr std::array<size_t, 5> hash_seeds{13'572'355'802'537'770'549ULL, // 2**64 / (e/2)
104 13'043'817'825'332'782'213ULL, // 2**64 / sqrt(2)
105 10'650'232'656'628'343'401ULL, // 2**64 / sqrt(3)
106 16'499'269'484'942'379'435ULL, // 2**64 / (sqrt(5)/2)
107 4'893'150'838'803'335'377ULL}; // 2**64 / (3*pi/5)
108
116 inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
117 {
118 h *= seed;
119 h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
120 h *= 11'400'714'819'323'198'485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
121 // Use fastrange (integer modulo without division) if possible.
122#ifdef __SIZEOF_INT128__
123 h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(size_in_bits)) >> 64);
124#else
125 h %= size_in_bits;
126#endif
127 return h;
128 }
129
130public:
132 static constexpr data_layout data_layout_mode = data_layout_mode_;
133
137 bloom_filter() = default;
138 bloom_filter(bloom_filter const &) = default;
139 bloom_filter & operator=(bloom_filter const &) = default;
142 ~bloom_filter() = default;
143
158 {
159 size_in_bits = size.get();
160 hash_funs = funs.get();
161
162 if (hash_funs == 0 || hash_funs > 5)
163 throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
164 if (size_in_bits == 0)
165 throw std::logic_error{"The size of a bloom filter must be > 0."};
166
167 hash_shift = std::countl_zero(size_in_bits);
168 data = seqan3::contrib::sdsl::bit_vector(size_in_bits);
169 }
170
184 {
185 std::tie(size_in_bits, hash_shift, hash_funs) = std::tie(bf.size_in_bits, bf.hash_shift, bf.hash_funs);
186
187 data = seqan3::contrib::sdsl::sd_vector<>{bf.data};
188 }
190
205 void emplace(size_t const value) noexcept
207 {
208 for (size_t i = 0; i < hash_funs; ++i)
209 {
210 size_t idx = hash_and_fit(value, hash_seeds[i]);
211 assert(idx < data.size());
212 data[idx] = 1;
213 };
214 }
215
228 void reset() noexcept
230 {
231 seqan3::contrib::sdsl::util::_set_zero_bits(data);
232 }
234
249 bool contains(size_t const value) const noexcept
250 {
251 for (size_t i = 0; i < hash_funs; i++)
252 {
253 size_t idx = hash_and_fit(value, hash_seeds[i]);
254 assert(idx < data.size());
255 if (data[idx] == 0)
256 return false;
257 }
258 return true;
259 }
261
280 template <std::ranges::range value_range_t>
281 size_t count(value_range_t && values) const noexcept
282 {
283 static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
284 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
285 "An individual value must be an unsigned integral.");
286
287 size_t result = 0;
288
289 for (auto && value : values)
290 result += contains(value);
291
292 return result;
293 }
295
302 size_t hash_function_count() const noexcept
303 {
304 return hash_funs;
305 }
306
310 size_t bit_size() const noexcept
311 {
312 return size_in_bits;
313 }
315
324 friend bool operator==(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
325 {
326 return std::tie(lhs.size_in_bits, lhs.hash_shift, lhs.hash_funs, lhs.data)
327 == std::tie(rhs.size_in_bits, rhs.hash_shift, rhs.hash_funs, rhs.data);
328 }
329
335 friend bool operator!=(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
336 {
337 return !(lhs == rhs);
338 }
340
351 constexpr data_type & raw_data() noexcept
352 {
353 return data;
354 }
355
357 constexpr data_type const & raw_data() const noexcept
358 {
359 return data;
360 }
362
370 template <cereal_archive archive_t>
371 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
372 {
373 archive(size_in_bits);
374 archive(hash_shift);
375 archive(hash_funs);
376 archive(data);
377 }
379};
380
381} // namespace seqan3
The Bloom Filter. A data structure that efficiently answers set-membership queries.
Definition bloom_filter.hpp:82
friend bool operator!=(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for inequality.
Definition bloom_filter.hpp:335
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition bloom_filter.hpp:351
bloom_filter(bloom_filter &&)=default
Defaulted.
static constexpr data_layout data_layout_mode
Indicates whether the Bloom Filter is compressed.
Definition bloom_filter.hpp:132
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition bloom_filter.hpp:357
bloom_filter & operator=(bloom_filter const &)=default
Defaulted.
bool contains(size_t const value) const noexcept
Check whether a value is present in the Bloom Filter.
Definition bloom_filter.hpp:249
bloom_filter(bloom_filter< data_layout::uncompressed > const &bf)
Construct a compressed Bloom Filter.
Definition bloom_filter.hpp:182
friend bool operator==(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for equality.
Definition bloom_filter.hpp:324
void reset() noexcept
Remove all values from the Bloom Filter by setting all bits to 0.
Definition bloom_filter.hpp:228
bloom_filter(bloom_filter const &)=default
Defaulted.
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Bloom Filter.
Definition bloom_filter.hpp:302
size_t count(value_range_t &&values) const noexcept
Counts the occurrences for all values in a range.
Definition bloom_filter.hpp:281
bloom_filter()=default
Defaulted.
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition bloom_filter.hpp:310
bloom_filter & operator=(bloom_filter &&)=default
Defaulted.
void emplace(size_t const value) noexcept
Inserts a value into the Bloom Filter.
Definition bloom_filter.hpp:205
~bloom_filter()=default
Defaulted.
bloom_filter(seqan3::bin_size size, seqan3::hash_function_count funs=seqan3::hash_function_count{2u})
Construct an uncompressed Bloom Filter.
Definition bloom_filter.hpp:156
T countl_zero(T... args)
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition interleaved_bloom_filter.hpp:24
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition interleaved_bloom_filter.hpp:25
@ compressed
The Interleaved Bloom Filter is compressed.
Definition interleaved_bloom_filter.hpp:26
Provides seqan3::interleaved_bloom_filter.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition interleaved_bloom_filter.hpp:39
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:46
strong_type for seed.
Definition minimiser_hash.hpp:22
T tie(T... args)
Hide me