SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
bloom_filter.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
13
14namespace seqan3
15{
16
80template <data_layout data_layout_mode_ = data_layout::uncompressed>
82{
83private:
85 template <data_layout data_layout_mode>
86 friend class bloom_filter;
88
90 using data_type =
92
94 size_t size_in_bits{}; // number of bits in the filter; hash_and_fit() reduces every hash to a value below this
96 size_t hash_shift{}; // right-shift used by hash_and_fit(); the sizing code in this file sets it to std::countl_zero(size_in_bits)
98 size_t hash_funs{}; // how many entries of hash_seeds are applied per value in emplace()/contains()
100 data_type data{}; // bit storage that emplace() writes to and contains() reads from
// Multipliers used as seeds: the i-th hash function passes hash_seeds[i] to hash_and_fit().
// The array has 5 entries, which matches the check in this file that rejects hash_funs > 5,
// so indexing with i < hash_funs stays in bounds.
102 static constexpr std::array<size_t, 5> hash_seeds{13'572'355'802'537'770'549ULL, // 2**64 / (e/2)
103 13'043'817'825'332'782'213ULL, // 2**64 / sqrt(2)
104 10'650'232'656'628'343'401ULL, // 2**64 / sqrt(3)
105 16'499'269'484'942'379'435ULL, // 2**64 / (sqrt(5)/2)
106 4'893'150'838'803'335'377ULL}; // 2**64 / (3*pi/5)
107
// Mixes the value `h` with `seed` and reduces the result to a bit position below size_in_bits.
// h:    the value to hash; taken by value and modified in place.
// seed: multiplier that selects the hash function; callers in this file pass hash_seeds[i].
// Returns the position to probe. With __uint128_t available the reduction is a multiply-high,
// otherwise a plain modulo; the two branches are not guaranteed to give the same position
// for the same input.
115 inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
116 {
117 h *= seed; // seed-specific multiplicative step (wraps modulo 2^64 for a 64-bit size_t)
118 h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
119 h *= 11'400'714'819'323'198'485ULL; // = 2^64 / golden_ratio, to expand h to 64 bit range
120 // Use fastrange (integer modulo without division) if possible.
121#ifdef __SIZEOF_INT128__
122 h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(size_in_bits)) >> 64); // upper 64 bits of h * size_in_bits, always < size_in_bits
123#else
124 h %= size_in_bits;
125#endif
126 return h;
127 }
128
129public:
132
136 bloom_filter() = default;
137 bloom_filter(bloom_filter const &) = default;
138 bloom_filter & operator=(bloom_filter const &) = default;
141 ~bloom_filter() = default;
142
157 {
158 size_in_bits = size.get();
159 hash_funs = funs.get();
160
161 if (hash_funs == 0 || hash_funs > 5)
162 throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
163 if (size_in_bits == 0)
164 throw std::logic_error{"The size of a bloom filter must be > 0."};
165
166 hash_shift = std::countl_zero(size_in_bits);
167 data = sdsl::bit_vector(size_in_bits);
168 }
169
183 {
184 std::tie(size_in_bits, hash_shift, hash_funs) = std::tie(bf.size_in_bits, bf.hash_shift, bf.hash_funs);
185
186 data = sdsl::sd_vector<>{bf.data};
187 }
189
// Inserts `value` by setting one bit per hash function.
// value: the value to insert; it is hashed once with each of the first hash_funs seeds.
// For every seed the position hash_and_fit(value, seed) is computed and that bit of `data` is set to 1.
// Bits are only ever set here, never cleared, so inserting the same value twice changes nothing.
204 void emplace(size_t const value) noexcept
206 {
207 for (size_t i = 0; i < hash_funs; ++i)
208 {
209 size_t idx = hash_and_fit(value, hash_seeds[i]);
210 assert(idx < data.size()); // debug-only check that the reduced hash is a valid position
211 data[idx] = 1;
212 }
213 }
214
229 {
230 sdsl::util::_set_zero_bits(data);
231 }
233
// Tests whether `value` may have been inserted.
// value: the value to look up; it is hashed with the same seeds, in the same order, as in emplace().
// Returns false as soon as one probed bit is 0, and true when every probed bit is set.
// A true result does not prove that `value` itself was inserted: other values can have set the same bits.
248 bool contains(size_t const value) const noexcept
249 {
250 for (size_t i = 0; i < hash_funs; i++)
251 {
252 size_t idx = hash_and_fit(value, hash_seeds[i]);
253 assert(idx < data.size()); // debug-only check that the reduced hash is a valid position
254 if (data[idx] == 0)
255 return false; // one cleared bit is enough to rule the value out
256 }
257 return true; // also reached without any probe when hash_funs == 0
258 }
260
// Counts how many elements of `values` are reported as present by contains().
// values: a range of values to look up; the static_asserts below require it to model
//         std::ranges::input_range and to have an unsigned integral value type.
// Returns the number of elements for which contains() returned true.
279 template <std::ranges::range value_range_t>
280 size_t count(value_range_t && values) const noexcept
281 {
282 static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
283 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
284 "An individual value must be an unsigned integral.");
285
286 size_t result = 0;
287
288 for (auto && value : values)
289 result += contains(value); // the bool result converts to 0 or 1
290
291 return result;
292 }
294
302 {
303 return hash_funs;
304 }
305
310 {
311 return size_in_bits;
312 }
314
// Equality comparison: true when size_in_bits, hash_shift, hash_funs and data all compare equal.
// The members are compared as tuples of references, so nothing is copied.
323 friend bool operator==(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
324 {
325 return std::tie(lhs.size_in_bits, lhs.hash_shift, lhs.hash_funs, lhs.data)
326 == std::tie(rhs.size_in_bits, rhs.hash_shift, rhs.hash_funs, rhs.data);
327 }
328
// Inequality comparison, defined as the negation of operator== so the two can never disagree.
334 friend bool operator!=(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
335 {
336 return !(lhs == rhs);
337 }
339
351 {
352 return data;
353 }
354
// Read-only access to the underlying storage: returns a const reference to the `data` member, not a copy.
356 constexpr data_type const & raw_data() const noexcept
357 {
358 return data;
359 }
361
369 template <cereal_archive archive_t>
371 {
372 archive(size_in_bits);
373 archive(hash_shift);
374 archive(hash_funs);
375 archive(data);
376 }
378};
379
380} // namespace seqan3
The Bloom Filter. A data structure that efficiently answers set-membership queries.
Definition bloom_filter.hpp:82
friend bool operator!=(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for inequality.
Definition bloom_filter.hpp:334
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition bloom_filter.hpp:350
bloom_filter(bloom_filter &&)=default
Defaulted.
static constexpr data_layout data_layout_mode
Indicates whether the Bloom Filter is compressed.
Definition bloom_filter.hpp:131
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition bloom_filter.hpp:356
bloom_filter & operator=(bloom_filter const &)=default
Defaulted.
bool contains(size_t const value) const noexcept
Check whether a value is present in the Bloom Filter.
Definition bloom_filter.hpp:248
bloom_filter(bloom_filter< data_layout::uncompressed > const &bf)
Construct a compressed Bloom Filter.
Definition bloom_filter.hpp:181
friend bool operator==(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for equality.
Definition bloom_filter.hpp:323
void reset() noexcept
Remove all values from the Bloom Filter by setting all bits to 0.
Definition bloom_filter.hpp:227
bloom_filter(bloom_filter const &)=default
Defaulted.
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Bloom Filter.
Definition bloom_filter.hpp:301
size_t count(value_range_t &&values) const noexcept
Counts the occurrences for all values in a range.
Definition bloom_filter.hpp:280
bloom_filter()=default
Defaulted.
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition bloom_filter.hpp:309
bloom_filter & operator=(bloom_filter &&)=default
Defaulted.
void emplace(size_t const value) noexcept
Inserts a value into the Bloom Filter.
Definition bloom_filter.hpp:204
~bloom_filter()=default
Defaulted.
bloom_filter(seqan3::bin_size size, seqan3::hash_function_count funs=seqan3::hash_function_count{2u})
Construct an uncompressed Bloom Filter.
Definition bloom_filter.hpp:155
A "pretty printer" for most SeqAn data structures and related types.
Definition debug_stream_type.hpp:79
T countl_zero(T... args)
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition interleaved_bloom_filter.hpp:25
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition interleaved_bloom_filter.hpp:26
@ compressed
The Interleaved Bloom Filter is compressed.
Definition interleaved_bloom_filter.hpp:27
Provides seqan3::interleaved_bloom_filter.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition interleaved_bloom_filter.hpp:40
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:47
strong_type for seed.
Definition minimiser_hash.hpp:22
T tie(T... args)
Hide me