SeqAn3 3.4.0-rc.4
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages Concepts
bloom_filter.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <seqan3/contrib/sdsl-lite.hpp>
14//Todo: When removing search/dream_index/interleaved_bloom_filter.hpp, the contents of the following header can be
15// moved into this file
17
18namespace seqan3
19{
20
84template <data_layout data_layout_mode_ = data_layout::uncompressed>
86{
87private:
89 template <data_layout data_layout_mode>
90 friend class bloom_filter;
92
95 seqan3::contrib::sdsl::bit_vector,
96 seqan3::contrib::sdsl::sd_vector<>>;
97
99 size_t size_in_bits{};
101 size_t hash_shift{};
103 size_t hash_funs{};
105 data_type data{};
107 static constexpr std::array<size_t, 5> hash_seeds{13'572'355'802'537'770'549ULL, // 2**64 / (e/2)
108 13'043'817'825'332'782'213ULL, // 2**64 / sqrt(2)
109 10'650'232'656'628'343'401ULL, // 2**64 / sqrt(3)
110 16'499'269'484'942'379'435ULL, // 2**64 / (sqrt(5)/2)
111 4'893'150'838'803'335'377ULL}; // 2**64 / (3*pi/5)
112
120 inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
121 {
122 h *= seed;
123 h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
124 h *= 11'400'714'819'323'198'485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
125 // Use fastrange (integer modulo without division) if possible.
126#ifdef __SIZEOF_INT128__
127 h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(size_in_bits)) >> 64);
128#else
129 h %= size_in_bits;
130#endif
131 return h;
132 }
133
134public:
137
141 bloom_filter() = default;
142 bloom_filter(bloom_filter const &) = default;
143 bloom_filter & operator=(bloom_filter const &) = default;
146 ~bloom_filter() = default;
147
162 {
163 size_in_bits = size.get();
164 hash_funs = funs.get();
165
166 if (hash_funs == 0 || hash_funs > 5)
167 throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
168 if (size_in_bits == 0)
169 throw std::logic_error{"The size of a bloom filter must be > 0."};
170
171 hash_shift = std::countl_zero(size_in_bits);
172 data = seqan3::contrib::sdsl::bit_vector(size_in_bits);
173 }
174
188 {
189 std::tie(size_in_bits, hash_shift, hash_funs) = std::tie(bf.size_in_bits, bf.hash_shift, bf.hash_funs);
190
191 data = seqan3::contrib::sdsl::sd_vector<>{bf.data};
192 }
194
209 void emplace(size_t const value) noexcept
211 {
212 for (size_t i = 0; i < hash_funs; ++i)
213 {
214 size_t idx = hash_and_fit(value, hash_seeds[i]);
215 assert(idx < data.size());
216 data[idx] = 1;
217 };
218 }
219
234 {
235 seqan3::contrib::sdsl::util::_set_zero_bits(data);
236 }
238
253 bool contains(size_t const value) const noexcept
254 {
255 for (size_t i = 0; i < hash_funs; i++)
256 {
257 size_t idx = hash_and_fit(value, hash_seeds[i]);
258 assert(idx < data.size());
259 if (data[idx] == 0)
260 return false;
261 }
262 return true;
263 }
265
284 template <std::ranges::range value_range_t>
285 size_t count(value_range_t && values) const noexcept
286 {
287 static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
288 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
289 "An individual value must be an unsigned integral.");
290
291 size_t result = 0;
292
293 for (auto && value : values)
294 result += contains(value);
295
296 return result;
297 }
299
307 {
308 return hash_funs;
309 }
310
315 {
316 return size_in_bits;
317 }
319
328 friend bool operator==(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
329 {
330 return std::tie(lhs.size_in_bits, lhs.hash_shift, lhs.hash_funs, lhs.data)
331 == std::tie(rhs.size_in_bits, rhs.hash_shift, rhs.hash_funs, rhs.data);
332 }
333
339 friend bool operator!=(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
340 {
341 return !(lhs == rhs);
342 }
344
356 {
357 return data;
358 }
359
361 constexpr data_type const & raw_data() const noexcept
362 {
363 return data;
364 }
366
374 template <cereal_archive archive_t>
376 {
377 archive(size_in_bits);
378 archive(hash_shift);
379 archive(hash_funs);
380 archive(data);
381 }
383};
384
385} // namespace seqan3
Provides strong types for the (Interleaved) Bloom Filter.
Adaptions of concepts from the Cereal library.
The Bloom Filter. A data structure that efficiently answers set-membership queries.
Definition bloom_filter.hpp:86
friend bool operator!=(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for inequality.
Definition bloom_filter.hpp:339
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition bloom_filter.hpp:355
bloom_filter(bloom_filter &&)=default
Defaulted.
static constexpr data_layout data_layout_mode
Indicates whether the Bloom Filter is compressed.
Definition bloom_filter.hpp:136
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition bloom_filter.hpp:361
bloom_filter & operator=(bloom_filter const &)=default
Defaulted.
bool contains(size_t const value) const noexcept
Check whether a value is present in the Bloom Filter.
Definition bloom_filter.hpp:253
bloom_filter(bloom_filter< data_layout::uncompressed > const &bf)
Construct a compressed Bloom Filter.
Definition bloom_filter.hpp:186
friend bool operator==(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for equality.
Definition bloom_filter.hpp:328
void reset() noexcept
Remove all values from the Bloom Filter by setting all bits to 0.
Definition bloom_filter.hpp:232
bloom_filter(bloom_filter const &)=default
Defaulted.
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Bloom Filter.
Definition bloom_filter.hpp:306
size_t count(value_range_t &&values) const noexcept
Counts the occurrences for all values in a range.
Definition bloom_filter.hpp:285
bloom_filter()=default
Defaulted.
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition bloom_filter.hpp:314
bloom_filter & operator=(bloom_filter &&)=default
Defaulted.
void emplace(size_t const value) noexcept
Inserts a value into the Bloom Filter.
Definition bloom_filter.hpp:209
~bloom_filter()=default
Defaulted.
bloom_filter(seqan3::bin_size size, seqan3::hash_function_count funs=seqan3::hash_function_count{2u})
Construct an uncompressed Bloom Filter.
Definition bloom_filter.hpp:160
A "pretty printer" for most SeqAn data structures and related types.
Definition debug_stream_type.hpp:79
T countl_zero(T... args)
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition bloom_filter_strong_types.hpp:23
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition bloom_filter_strong_types.hpp:24
@ compressed
The Interleaved Bloom Filter is compressed.
Definition bloom_filter_strong_types.hpp:25
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition bloom_filter_strong_types.hpp:38
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition bloom_filter_strong_types.hpp:45
strong_type for seed.
Definition minimiser_hash.hpp:22
T tie(T... args)
Hide me