HIBF 1.0.0-rc.1
All Classes Namespaces Files Functions Variables Typedefs Friends Macros Modules Pages Concepts
interleaved_bloom_filter.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <algorithm> // for fill
13#include <array> // for array
14#include <cassert> // for assert
15#include <concepts> // for integral, same_as, unsigned_integral
16#include <cstdint> // for uint32_t, uint16_t, uint64_t
17#include <cstring> // for size_t
18#include <memory> // for addressof
19#include <ranges> // for range, range_reference_t, range_value_t, forward_range, input_...
20#include <type_traits> // for remove_cvref_t
21#include <vector> // for vector, operator==
22
23#include <cereal/cereal.hpp> // for make_nvp
24#include <cereal/macros.hpp> // for CEREAL_SERIALIZE_FUNCTION_NAME
25#include <cereal/types/base_class.hpp> // for base_class
26#include <cereal/types/vector.hpp> // IWYU pragma: keep
27
28#include <hibf/cereal/concepts.hpp> // for cereal_archive
29#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
30#include <hibf/misc/bit_vector.hpp> // for bit_vector
31#include <hibf/misc/counting_vector.hpp> // for counting_vector
32#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
33#include <hibf/platform.hpp> // for HIBF_CONSTEXPR_VECTOR, HIBF_HAS_AVX512
34
35namespace seqan::hibf
36{
37
38// config.hpp -> misc/insert_iterator.hpp (Needs interleaved_bloom_filter to be a complete class)
39struct config;
40
46{
47 size_t value;
48};
49
55{
56 size_t value;
57};
58
64{
65 size_t value;
66};
67
73{
74 size_t value;
75};
76
149{
150private:
153
155 template <typename t>
156 friend struct cereal::base_class;
157
159 size_t bins{};
161 size_t technical_bins{};
163 size_t bin_size_{};
165 size_t hash_shift{};
167 size_t bin_words{};
169 size_t hash_funs{};
171 static constexpr std::array<size_t, 5> hash_seeds{13572355802537770549ULL, // 2**64 / (e/2)
172 13043817825332782213ULL, // 2**64 / sqrt(2)
173 10650232656628343401ULL, // 2**64 / sqrt(3)
174 16499269484942379435ULL, // 2**64 / (sqrt(5)/2)
175 4893150838803335377ULL}; // 2**64 / (3*pi/5)
176
184 [[gnu::always_inline]] inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
185 {
186 h *= seed;
187 assert(hash_shift < 64);
188 h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
189 h *= 11400714819323198485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
190 // Use fastrange (integer modulo without division).
191 h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(bin_size_)) >> 64);
192 h *= technical_bins;
193 return h;
194 }
195
196public:
197 class membership_agent_type; // documented upon definition below
198 template <std::integral value_t>
199 class counting_agent_type; // documented upon definition below
200
208 interleaved_bloom_filter & operator=(interleaved_bloom_filter &&) noexcept = default;
210
223 interleaved_bloom_filter(seqan::hibf::bin_count const bins_,
224 seqan::hibf::bin_size const size,
225 seqan::hibf::hash_function_count const funs = seqan::hibf::hash_function_count{2u},
226 bool const track_occupancy_ = false);
227
239 interleaved_bloom_filter(config & configuration, size_t const max_bin_elements = 0u);
241
257 void emplace(size_t const value, bin_index const bin) noexcept;
258
268 void clear(bin_index const bin) noexcept;
269
281 template <typename rng_t>
282 void clear(rng_t && bin_range) noexcept
283 {
284 static_assert(std::ranges::forward_range<rng_t>, "The range of bins to clear must model a forward_range.");
286 "The reference type of the range to clear must be seqan::hibf::bin_index.");
287#ifndef NDEBUG
288 for (auto && bin : bin_range)
289 assert(bin.value < technical_bins);
290#endif // NDEBUG
291
292 for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
293 for (auto && bin : bin_range)
294 (*this)[bin.value + offset] = 0;
295 }
296
318 bool try_increase_bin_number_to(bin_count const new_bin_count) noexcept;
319
342 void increase_bin_number_to(bin_count const new_bin_count);
344
360
372 template <typename value_t = uint16_t>
378
385 size_t hash_function_count() const noexcept
386 {
387 return hash_funs;
388 }
389
393 size_t bin_count() const noexcept
394 {
395 return bins;
396 }
397
401 size_t bin_size() const noexcept
402 {
403 return bin_size_;
404 }
405
409 size_t bit_size() const noexcept
410 {
411 return base_t::size();
412 }
414
418 HIBF_CONSTEXPR_VECTOR bool operator==(interleaved_bloom_filter const &) const = default;
420
431 using base_t::data;
433
441
443 bool track_occupancy{false};
444
448 static constexpr uint32_t version{1};
449
456 template <cereal_archive archive_t>
457 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
458 {
459 uint32_t parsed_version{version};
460 archive(cereal::make_nvp("version", parsed_version));
461 archive(bins);
462 archive(technical_bins);
463 archive(bin_size_);
464 archive(hash_shift);
465 archive(bin_words);
466 archive(hash_funs);
467 archive(cereal::base_class<base_t>(this));
468 archive(occupancy);
469 archive(track_occupancy);
470 }
472};
473
485{
486private:
488 interleaved_bloom_filter const * ibf_ptr{nullptr};
489
491 std::array<size_t, 5> bloom_filter_indices;
492
494 bit_vector result_buffer;
495
496public:
504 membership_agent_type & operator=(membership_agent_type &&) noexcept = default;
506
511 explicit membership_agent_type(interleaved_bloom_filter const & ibf) :
512 ibf_ptr(std::addressof(ibf)),
513 result_buffer(ibf.bin_count())
514 {}
516
537 [[nodiscard]] bit_vector const & bulk_contains(size_t const value) & noexcept;
538
539 // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
540 // is immediately destroyed.
541 [[nodiscard]] bit_vector const & bulk_contains(size_t const value) && noexcept = delete;
543};
544
549
559template <std::integral value_t>
561{
562private:
564 interleaved_bloom_filter const * ibf_ptr{nullptr};
565
567 membership_agent_type membership_agent;
568
570 counting_vector<value_t> result_buffer;
571
572public:
580 counting_agent_type & operator=(counting_agent_type &&) noexcept = default;
581 ~counting_agent_type() = default;
582
587 explicit counting_agent_type(interleaved_bloom_filter const & ibf) :
588 ibf_ptr(std::addressof(ibf)),
589 membership_agent(ibf),
590#if !HIBF_HAS_AVX512
591 result_buffer(ibf.bin_count())
592 {}
593#else
594 // AVX512 will access result_buffer's memory in chunks, so we need to make sure that we allocate enough memory
595 // such that the last chunk is not out of bounds.
596 result_buffer(next_multiple_of_64(ibf.bin_count())) // Ensure large enough capacity.
597 {
598 result_buffer.resize(ibf.bin_count()); // Resize to actual requested size.
599 // Silences llvm's ASAN container-overflow warning.
600# if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_HAS_NO_ASAN)
601 __sanitizer_annotate_contiguous_container(result_buffer.data(),
602 result_buffer.data() + result_buffer.capacity(),
603 result_buffer.data() + result_buffer.size(),
604 result_buffer.data() + result_buffer.capacity());
605# endif
606 }
607#endif
608
610
633 template <std::ranges::range value_range_t>
634 [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) & noexcept
635 {
636 assert(ibf_ptr != nullptr);
637 assert(result_buffer.size() == ibf_ptr->bin_count());
638
639 static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
641 "An individual value must be an unsigned integral.");
642
643 std::ranges::fill(result_buffer, 0);
644
645 for (auto && value : values)
646 result_buffer += membership_agent.bulk_contains(value);
647
648 return result_buffer;
649 }
650
651 // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
652 // is immediately destroyed.
653 template <std::ranges::range value_range_t>
654 [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) && noexcept = delete;
656};
657
658} // namespace seqan::hibf
Provides seqan::hibf::bit_vector.
T capacity(T... args)
An bit vector.
Definition bit_vector.hpp:68
constexpr size_type size() const noexcept
Returns the number of elements.
Definition bit_vector.hpp:624
A data structure that behaves like a std::vector and can be used to consolidate the results of multip...
Definition counting_vector.hpp:146
Manages counting ranges of values for the seqan::hibf::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:561
counting_vector< value_t > const & bulk_count(value_range_t &&values) &noexcept
Counts the occurrences in each bin for all values in a range.
Definition interleaved_bloom_filter.hpp:634
counting_vector< value_t > const & bulk_count(value_range_t &&values) &&noexcept=delete
Counts the occurrences in each bin for all values in a range.
counting_agent_type(counting_agent_type const &)=default
Defaulted.
counting_agent_type(counting_agent_type &&) noexcept=default
Defaulted.
counting_agent_type & operator=(counting_agent_type const &)=default
Defaulted.
Manages membership queries for the seqan::hibf::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:485
membership_agent_type(membership_agent_type const &)=default
Defaulted.
bit_vector const & bulk_contains(size_t const value) &noexcept
Determines set membership of a given value.
membership_agent_type(membership_agent_type &&) noexcept=default
Defaulted.
membership_agent_type & operator=(membership_agent_type const &)=default
Defaulted.
bit_vector const & bulk_contains(size_t const value) &&noexcept=delete
Determines set membership of a given value.
The IBF binning directory. A data structure that efficiently answers set-membership queries for multi...
Definition interleaved_bloom_filter.hpp:149
bool track_occupancy
Whether to track the occupancy of the bins.
Definition interleaved_bloom_filter.hpp:443
std::vector< size_t > occupancy
Contains the number of unique values inserted into each bin.
Definition interleaved_bloom_filter.hpp:440
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition interleaved_bloom_filter.hpp:409
interleaved_bloom_filter(config &configuration, size_t const max_bin_elements=0u)
Construct an Interleaved Bloom Filter.
interleaved_bloom_filter()=default
Defaulted.
counting_agent_type< value_t > counting_agent() const
Returns a seqan::hibf::interleaved_bloom_filter::counting_agent_type to be used for counting.
Definition interleaved_bloom_filter.hpp:373
void increase_bin_number_to(bin_count const new_bin_count)
Increases the number of bins stored in the Interleaved Bloom Filter.
friend struct cereal::base_class
Allow cereal to cast the IBF into its base class.
Definition interleaved_bloom_filter.hpp:156
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Interleaved Bloom Filter.
Definition interleaved_bloom_filter.hpp:385
void emplace(size_t const value, bin_index const bin) noexcept
Inserts a value into a specific bin.
void clear(bin_index const bin) noexcept
Clears a specific bin.
interleaved_bloom_filter & operator=(interleaved_bloom_filter const &)=default
Defaulted.
bool try_increase_bin_number_to(bin_count const new_bin_count) noexcept
Tries increasing the number of bins stored in the Interleaved Bloom Filter without reallocating memor...
void clear(rng_t &&bin_range) noexcept
Clears a range of bins.
Definition interleaved_bloom_filter.hpp:282
membership_agent_type membership_agent() const
Returns a seqan::hibf::interleaved_bloom_filter::membership_agent_type to be used for lookup.
Definition interleaved_bloom_filter.hpp:545
static constexpr uint32_t version
The version of the HIBF.
Definition interleaved_bloom_filter.hpp:448
interleaved_bloom_filter(interleaved_bloom_filter const &)=default
Defaulted.
interleaved_bloom_filter(interleaved_bloom_filter &&) noexcept=default
Defaulted.
size_t bin_size() const noexcept
Returns the size of a single bin that the Interleaved Bloom Filter manages.
Definition interleaved_bloom_filter.hpp:401
size_t bin_count() const noexcept
Returns the number of bins that the Interleaved Bloom Filter manages.
Definition interleaved_bloom_filter.hpp:393
Provides seqan::hibf::counting_vector.
T data(T... args)
T fill(T... args)
constexpr size_t next_multiple_of_64(size_t const value) noexcept
Returns the smallest integer that is greater or equal to value and a multiple of 64....
Definition next_multiple_of_64.hpp:18
T is_base_of_v
Provides platform and dependency checks.
#define HIBF_CONSTEXPR_VECTOR
std::vector constexpr support.
Definition platform.hpp:120
T resize(T... args)
T size(T... args)
A strong type that represents the number of bins for the seqan::hibf::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:46
A strong type that represents the bin index for the seqan::hibf::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:73
A strong type that represents the number of bits for each bin in the seqan::hibf::interleaved_bloom_f...
Definition interleaved_bloom_filter.hpp:55
The configuration used to build an (H)IBF.
Definition config.hpp:75
A strong type that represents the number of hash functions for the seqan::hibf::interleaved_bloom_fil...
Definition interleaved_bloom_filter.hpp:64