SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
bloom_filter.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
16
17namespace seqan3
18{
19
83template <data_layout data_layout_mode_ = data_layout::uncompressed>
85{
86private:
88 template <data_layout data_layout_mode>
89 friend class bloom_filter;
91
93 using data_type =
95
97 size_t size_in_bits{};
99 size_t hash_shift{};
101 size_t hash_funs{};
103 data_type data{};
105 static constexpr std::array<size_t, 5> hash_seeds{13572355802537770549ULL, // 2**64 / (e/2)
106 13043817825332782213ULL, // 2**64 / sqrt(2)
107 10650232656628343401ULL, // 2**64 / sqrt(3)
108 16499269484942379435ULL, // 2**64 / (sqrt(5)/2)
109 4893150838803335377ULL}; // 2**64 / (3*pi/5)
110
118 inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
119 {
120 h *= seed;
121 h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
122 h *= 11400714819323198485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
123 // Use fastrange (integer modulo without division) if possible.
124#ifdef __SIZEOF_INT128__
125 h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(size_in_bits)) >> 64);
126#else
127 h %= size_in_bits;
128#endif
129 return h;
130 }
131
132public:
134 static constexpr data_layout data_layout_mode = data_layout_mode_;
135
139 bloom_filter() = default;
140 bloom_filter(bloom_filter const &) = default;
141 bloom_filter & operator=(bloom_filter const &) = default;
144 ~bloom_filter() = default;
145
160 {
161 size_in_bits = size.get();
162 hash_funs = funs.get();
163
164 if (hash_funs == 0 || hash_funs > 5)
165 throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
166 if (size_in_bits == 0)
167 throw std::logic_error{"The size of a bloom filter must be > 0."};
168
169 hash_shift = std::countl_zero(size_in_bits);
170 data = sdsl::bit_vector(size_in_bits);
171 }
172
186 {
187 std::tie(size_in_bits, hash_shift, hash_funs) = std::tie(bf.size_in_bits, bf.hash_shift, bf.hash_funs);
188
189 data = sdsl::sd_vector<>{bf.data};
190 }
192
207 void emplace(size_t const value) noexcept
209 {
210 for (size_t i = 0; i < hash_funs; ++i)
211 {
212 size_t idx = hash_and_fit(value, hash_seeds[i]);
213 assert(idx < data.size());
214 data[idx] = 1;
215 };
216 }
217
230 void reset() noexcept
232 {
233 sdsl::util::_set_zero_bits(data);
234 }
236
251 bool contains(size_t const value) const noexcept
252 {
253 for (size_t i = 0; i < hash_funs; i++)
254 {
255 size_t idx = hash_and_fit(value, hash_seeds[i]);
256 assert(idx < data.size());
257 if (data[idx] == 0)
258 return false;
259 }
260 return true;
261 }
263
282 template <std::ranges::range value_range_t>
283 size_t count(value_range_t && values) const noexcept
284 {
285 static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
286 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
287 "An individual value must be an unsigned integral.");
288
289 size_t result = 0;
290
291 for (auto && value : values)
292 result += contains(value);
293
294 return result;
295 }
297
304 size_t hash_function_count() const noexcept
305 {
306 return hash_funs;
307 }
308
312 size_t bit_size() const noexcept
313 {
314 return size_in_bits;
315 }
317
326 friend bool operator==(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
327 {
328 return std::tie(lhs.size_in_bits, lhs.hash_shift, lhs.hash_funs, lhs.data)
329 == std::tie(rhs.size_in_bits, rhs.hash_shift, rhs.hash_funs, rhs.data);
330 }
331
337 friend bool operator!=(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
338 {
339 return !(lhs == rhs);
340 }
342
353 constexpr data_type & raw_data() noexcept
354 {
355 return data;
356 }
357
359 constexpr data_type const & raw_data() const noexcept
360 {
361 return data;
362 }
364
372 template <cereal_archive archive_t>
373 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
374 {
375 archive(size_in_bits);
376 archive(hash_shift);
377 archive(hash_funs);
378 archive(data);
379 }
381};
382
383} // namespace seqan3
The Bloom Filter. A data structure that efficiently answers set-membership queries.
Definition: bloom_filter.hpp:85
friend bool operator!=(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for inequality.
Definition: bloom_filter.hpp:337
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: bloom_filter.hpp:353
bloom_filter(bloom_filter &&)=default
Defaulted.
static constexpr data_layout data_layout_mode
Indicates whether the Bloom Filter is compressed.
Definition: bloom_filter.hpp:134
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: bloom_filter.hpp:359
bloom_filter & operator=(bloom_filter const &)=default
Defaulted.
bool contains(size_t const value) const noexcept
Check whether a value is present in the Bloom Filter.
Definition: bloom_filter.hpp:251
bloom_filter(bloom_filter< data_layout::uncompressed > const &bf)
Construct a compressed Bloom Filter.
Definition: bloom_filter.hpp:184
friend bool operator==(bloom_filter const &lhs, bloom_filter const &rhs) noexcept
Test for equality.
Definition: bloom_filter.hpp:326
void reset() noexcept
Remove all values from the Bloom Filter by setting all bits to 0.
Definition: bloom_filter.hpp:230
bloom_filter(bloom_filter const &)=default
Defaulted.
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Bloom Filter.
Definition: bloom_filter.hpp:304
size_t count(value_range_t &&values) const noexcept
Counts the occurrences for all values in a range.
Definition: bloom_filter.hpp:283
bloom_filter()=default
Defaulted.
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition: bloom_filter.hpp:312
bloom_filter & operator=(bloom_filter &&)=default
Defaulted.
void emplace(size_t const value) noexcept
Inserts a value into the Bloom Filter.
Definition: bloom_filter.hpp:207
~bloom_filter()=default
Defaulted.
bloom_filter(seqan3::bin_size size, seqan3::hash_function_count funs=seqan3::hash_function_count{2u})
Construct an uncompressed Bloom Filter.
Definition: bloom_filter.hpp:158
T countl_zero(T... args)
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:28
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition: interleaved_bloom_filter.hpp:29
@ compressed
The Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:30
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:146
Provides seqan3::interleaved_bloom_filter.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition: interleaved_bloom_filter.hpp:43
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:50
strong_type for seed.
Definition: minimiser_hash.hpp:25
T tie(T... args)