SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
interleaved_bloom_filter.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/algorithm>
16#include <seqan3/std/bit>
17
18#include <sdsl/bit_vectors.hpp>
19
22
23namespace seqan3
24{
27enum data_layout : bool
28{
31};
32
35struct bin_count : public detail::strong_type<size_t, bin_count, detail::strong_type_skill::convert>
36{
37 using detail::strong_type<size_t, bin_count, detail::strong_type_skill::convert>::strong_type;
38};
39
42struct bin_size : public detail::strong_type<size_t, bin_size, detail::strong_type_skill::convert>
43{
44 using detail::strong_type<size_t, bin_size, detail::strong_type_skill::convert>::strong_type;
45};
46
49struct hash_function_count : public detail::strong_type<size_t, hash_function_count, detail::strong_type_skill::convert>
50{
51 using detail::strong_type<size_t, hash_function_count, detail::strong_type_skill::convert>::strong_type;
52};
53
56struct bin_index : public detail::strong_type<size_t, bin_index, detail::strong_type_skill::convert>
57{
58 using detail::strong_type<size_t, bin_index, detail::strong_type_skill::convert>::strong_type;
59};
60
132template <data_layout data_layout_mode_ = data_layout::uncompressed>
134{
135private:
137 template <data_layout data_layout_mode>
138 friend class interleaved_bloom_filter;
140
142 using data_type = std::conditional_t<data_layout_mode_ == data_layout::uncompressed,
143 sdsl::bit_vector,
144 sdsl::sd_vector<>>;
145
147 size_t bins{};
149 size_t technical_bins{};
151 size_t bin_size_{};
153 size_t hash_shift{};
155 size_t bin_words{};
157 size_t hash_funs{};
159 data_type data{};
161 static constexpr std::array<size_t, 5> hash_seeds{13572355802537770549ULL, // 2**64 / (e/2)
162 13043817825332782213ULL, // 2**64 / sqrt(2)
163 10650232656628343401ULL, // 2**64 / sqrt(3)
164 16499269484942379435ULL, // 2**64 / (sqrt(5)/2)
165 4893150838803335377ULL}; // 2**64 / (3*pi/5)
166
174 inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
175 {
176 h *= seed;
177 assert(hash_shift < 64);
178 h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
179 h *= 11400714819323198485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
180 // Use fastrange (integer modulo without division) if possible.
181#ifdef __SIZEOF_INT128__
182 h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(bin_size_)) >> 64);
183#else
184 h %= bin_size_;
185#endif
186 h *= technical_bins;
187 return h;
188 }
189
190public:
192 static constexpr data_layout data_layout_mode = data_layout_mode_;
193
194 class membership_agent; // documented upon definition below
195 template <std::integral value_t>
196 class counting_agent_type; // documented upon definition below
197
207
227 {
228 bins = bins_.get();
229 bin_size_ = size.get();
230 hash_funs = funs.get();
231
232 if (bins == 0)
233 throw std::logic_error{"The number of bins must be > 0."};
234 if (hash_funs == 0 || hash_funs > 5)
235 throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
236 if (bin_size_ == 0)
237 throw std::logic_error{"The size of a bin must be > 0."};
238
239 hash_shift = std::countl_zero(bin_size_);
240 bin_words = (bins + 63) >> 6; // = ceil(bins/64)
241 technical_bins = bin_words << 6; // = bin_words * 64
242 data = sdsl::bit_vector(technical_bins * bin_size_);
243 }
244
260 {
261 std::tie(bins, technical_bins, bin_size_, hash_shift, bin_words, hash_funs) =
262 std::tie(ibf.bins, ibf.technical_bins, ibf.bin_size_, ibf.hash_shift, ibf.bin_words, ibf.hash_funs);
263
264 data = sdsl::sd_vector<>{ibf.data};
265 }
267
283 void emplace(size_t const value, bin_index const bin) noexcept
287 {
288 assert(bin.get() < bins);
289 for (size_t i = 0; i < hash_funs; ++i)
290 {
291 size_t idx = hash_and_fit(value, hash_seeds[i]);
292 idx += bin.get();
293 assert(idx < data.size());
294 data[idx] = 1;
295 };
296 }
297
309 void clear(bin_index const bin) noexcept
313 {
314 assert(bin.get() < bins);
315 for (size_t idx = bin.get(), i = 0; i < bin_size_; idx += technical_bins, ++i)
316 data[idx] = 0;
317 }
318
332 template <typename rng_t>
336 void clear(rng_t && bin_range) noexcept
337 {
338 static_assert(std::ranges::forward_range<rng_t>, "The range of bins to clear must model a forward_range.");
339 static_assert(std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<rng_t>>, bin_index>,
340 "The reference type of the range to clear must be seqan3::bin_index.");
341#ifndef NDEBUG
342 for (auto && bin : bin_range)
343 assert(bin.get() < bins);
344#endif // NDEBUG
345
346 for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
347 for (auto && bin : bin_range)
348 data[bin.get() + offset] = 0;
349 }
350
374 void increase_bin_number_to(bin_count const new_bins_)
378 {
379 size_t new_bins = new_bins_.get();
380
381 if (new_bins < bins)
382 throw std::invalid_argument{"The number of new bins must be >= the current number of bins."};
383
384 // Equivalent to ceil(new_bins / 64)
385 size_t new_bin_words = (new_bins + 63) >> 6;
386
387 bins = new_bins;
388
389 if (new_bin_words == bin_words) // No need for internal resize if bin_words does not change.
390 return;
391
392 size_t new_technical_bins = new_bin_words << 6;
393 size_t new_bits = bin_size_ * new_technical_bins;
394
395 size_t idx_{new_bits}, idx{data.size()};
396 size_t delta = new_technical_bins - technical_bins + 64;
397
398 data.resize(new_bits);
399
400 for (size_t i = idx_, j = idx; j > 0; i -= new_technical_bins, j -= technical_bins)
401 {
402 size_t stop = i - new_technical_bins;
403
404 for (size_t ii = i - delta, jj = j - 64; stop && ii >= stop; ii -= 64, jj -= 64)
405 {
406 uint64_t old = data.get_int(jj);
407 data.set_int(jj, 0);
408 data.set_int(ii, old);
409 }
410 }
411
412 bin_words = new_bin_words;
413 technical_bins = new_technical_bins;
414 }
416
432 {
434 }
435
447 template <typename value_t = uint16_t>
449 {
450 return counting_agent_type<value_t>{*this};
451 }
453
460 size_t hash_function_count() const noexcept
461 {
462 return hash_funs;
463 }
464
468 size_t bin_count() const noexcept
469 {
470 return bins;
471 }
472
476 size_t bin_size() const noexcept
477 {
478 return bin_size_;
479 }
480
484 size_t bit_size() const noexcept
485 {
486 return data.size();
487 }
489
498 friend bool operator==(interleaved_bloom_filter const & lhs, interleaved_bloom_filter const & rhs) noexcept
499 {
500 return std::tie(lhs.bins, lhs.technical_bins, lhs.bin_size_, lhs.hash_shift, lhs.bin_words, lhs.hash_funs,
501 lhs.data) ==
502 std::tie(rhs.bins, rhs.technical_bins, rhs.bin_size_, rhs.hash_shift, rhs.bin_words, rhs.hash_funs,
503 rhs.data);
504 }
505
511 friend bool operator!=(interleaved_bloom_filter const & lhs, interleaved_bloom_filter const & rhs) noexcept
512 {
513 return !(lhs == rhs);
514 }
516
527 constexpr data_type & raw_data() noexcept
528 {
529 return data;
530 }
531
533 constexpr data_type const & raw_data() const noexcept
534 {
535 return data;
536 }
538
546 template <cereal_archive archive_t>
547 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
548 {
549 archive(bins);
550 archive(technical_bins);
551 archive(bin_size_);
552 archive(hash_shift);
553 archive(bin_words);
554 archive(hash_funs);
555 archive(data);
556 }
558};
559
570template <data_layout data_layout_mode>
572{
573private:
576
578 ibf_t const * ibf_ptr{nullptr};
579
580public:
581 class binning_bitvector;
582
586 membership_agent() = default;
591 ~membership_agent() = default;
592
597 explicit membership_agent(ibf_t const & ibf) :
598 ibf_ptr(std::addressof(ibf)), result_buffer(ibf.bin_count())
599 {}
601
604
625 [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) & noexcept
626 {
627 assert(ibf_ptr != nullptr);
628 assert(result_buffer.size() == ibf_ptr->bin_count());
629
630 std::array<size_t, 5> bloom_filter_indices;
631 std::memcpy(&bloom_filter_indices, &ibf_ptr->hash_seeds, sizeof(size_t) * ibf_ptr->hash_funs);
632
633 for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
634 bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, bloom_filter_indices[i]);
635
636 for (size_t batch = 0; batch < ibf_ptr->bin_words; ++batch)
637 {
638 size_t tmp{-1ULL};
639 for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
640 {
641 assert(bloom_filter_indices[i] < ibf_ptr->data.size());
642 tmp &= ibf_ptr->data.get_int(bloom_filter_indices[i]);
643 bloom_filter_indices[i] += 64;
644 }
645
646 result_buffer.data.set_int(batch << 6, tmp);
647 }
648
649 return result_buffer;
650 }
651
652 // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
653 // is immediately destroyed.
654 [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) && noexcept = delete;
656
657};
658
660template <data_layout data_layout_mode>
662{
663private:
665 using data_type = sdsl::bit_vector;
667 data_type data{};
668
669 friend class membership_agent;
670
671 template <std::integral value_t>
672 friend class counting_vector;
673
674public:
678 binning_bitvector() = default;
683 ~binning_bitvector() = default;
684
686 explicit binning_bitvector(size_t const size) :
687 data(size)
688 {}
690
692 size_t size() const noexcept
693 {
694 return data.size();
695 }
696
701 auto begin() noexcept
702 {
703 return data.begin();
704 }
705
707 auto begin() const noexcept
708 {
709 return data.begin();
710 }
711
713 auto end() noexcept
714 {
715 return data.end();
716 }
717
719 auto end() const noexcept
720 {
721 return data.end();
722 }
724
729 friend bool operator==(binning_bitvector const & lhs, binning_bitvector const & rhs) noexcept
730 {
731 return lhs.data == rhs.data;
732 }
733
735 friend bool operator!=(binning_bitvector const & lhs, binning_bitvector const & rhs) noexcept
736 {
737 return !(lhs == rhs);
738 }
740
745 auto operator[](size_t const i) noexcept
746 {
747 assert(i < size());
748 return data[i];
749 }
750
752 auto operator[](size_t const i) const noexcept
753 {
754 assert(i < size());
755 return data[i];
756 }
757
765 constexpr data_type & raw_data() noexcept
766 {
767 return data;
768 }
769
771 constexpr data_type const & raw_data() const noexcept
772 {
773 return data;
774 }
776};
777
801template <std::integral value_t>
802class counting_vector : public std::vector<value_t>
803{
804private:
807public:
811 counting_vector() = default;
812 counting_vector(counting_vector const &) = default;
816 ~counting_vector() = default;
817
818 using base_t::base_t;
820
833 template <typename rhs_t>
835 requires std::same_as<rhs_t,
837 ||
838 std::same_as<rhs_t,
841 counting_vector & operator+=(rhs_t const & rhs)
842 {
843 assert(this->size() >= rhs.size()); // The counting vector may be bigger than what we need.
844
845 // Each iteration can handle 64 bits, so we need to iterate `((rhs.size() + 63) >> 6` many times
846 for (size_t batch = 0, bin = 0; batch < ((rhs.size() + 63) >> 6); bin = 64 * ++batch)
847 {
848 size_t tmp = rhs.data.get_int(batch * 64); // get 64 bits starting at position `batch * 64`
849 if (tmp ^ (1ULL<<63)) // This is a special case, because we would shift by 64 (UB) in the while loop.
850 {
851 while (tmp > 0)
852 {
853 // Jump to the next 1 and increment the corresponding vector entry.
854 uint8_t step = std::countr_zero(tmp);
855 bin += step++;
856 tmp >>= step;
857 ++(*this)[bin++];
858 }
859 }
860 else
861 {
862 ++(*this)[bin + 63];
863 }
864 }
865 return *this;
866 }
867
879 {
880 assert(this->size() >= rhs.size()); // The counting vector may be bigger than what we need.
881
882 std::transform(this->begin(), this->end(), rhs.begin(), this->begin(), std::plus<value_t>());
883
884 return *this;
885 }
886};
887
897template <data_layout data_layout_mode>
898template <std::integral value_t>
900{
901private:
904
906 ibf_t const * ibf_ptr{nullptr};
907
910
911public:
921
926 explicit counting_agent_type(ibf_t const & ibf) :
927 ibf_ptr(std::addressof(ibf)), membership_agent(ibf), result_buffer(ibf.bin_count())
928 {}
930
933
956 template <std::ranges::range value_range_t>
957 [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) & noexcept
958 {
959 assert(ibf_ptr != nullptr);
960 assert(result_buffer.size() == ibf_ptr->bin_count());
961
962 static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
963 static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
964 "An individual value must be an unsigned integral.");
965
966 std::ranges::fill(result_buffer, 0);
967
968 for (auto && value : values)
969 result_buffer += membership_agent.bulk_contains(value);
970
971 return result_buffer;
972 }
973
974 // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
975 // is immediately destroyed.
976 template <std::ranges::range value_range_t>
977 [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) && noexcept = delete;
979
980};
981
982} // namespace seqan3
The <algorithm> header from C++20's standard library.
The <bit> header from C++20's standard library.
Adaptions of concepts from the Cereal library.
A data structure that behaves like a std::vector and can be used to consolidate the results of multip...
Definition: interleaved_bloom_filter.hpp:803
counting_vector & operator+=(rhs_t const &rhs)
Bin-wise adds the bits of a seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector.
Definition: interleaved_bloom_filter.hpp:841
counting_vector & operator+=(counting_vector const &rhs)
Bin-wise addition of two seqan3::counting_vectors.
Definition: interleaved_bloom_filter.hpp:878
counting_vector & operator=(counting_vector const &)=default
Defaulted.
counting_vector & operator=(counting_vector &&)=default
Defaulted.
counting_vector(counting_vector const &)=default
Defaulted.
~counting_vector()=default
Defaulted.
counting_vector(counting_vector &&)=default
Defaulted.
counting_vector()=default
Defaulted.
Manages counting ranges of values for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:900
counting_vector< value_t > const & bulk_count(value_range_t &&values) &noexcept
Counts the occurrences in each bin for all values in a range.
Definition: interleaved_bloom_filter.hpp:957
counting_agent_type(counting_agent_type &&)=default
Defaulted.
counting_agent_type(counting_agent_type const &)=default
Defaulted.
counting_agent_type & operator=(counting_agent_type const &)=default
Defaulted.
counting_vector< value_t > result_buffer
Stores the result of bulk_count().
Definition: interleaved_bloom_filter.hpp:932
counting_agent_type & operator=(counting_agent_type &&)=default
Defaulted.
A bitvector representing the result of a call to bulk_contains of the seqan3::interleaved_bloom_filte...
Definition: interleaved_bloom_filter.hpp:662
auto end() noexcept
Returns an iterator to the element following the last element of the container.
Definition: interleaved_bloom_filter.hpp:713
auto end() const noexcept
Returns an iterator to the element following the last element of the container.
Definition: interleaved_bloom_filter.hpp:719
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: interleaved_bloom_filter.hpp:765
binning_bitvector(size_t const size)
Construct with given size.
Definition: interleaved_bloom_filter.hpp:686
auto operator[](size_t const i) const noexcept
Return the i-th element.
Definition: interleaved_bloom_filter.hpp:752
binning_bitvector & operator=(binning_bitvector &&)=default
Defaulted.
size_t size() const noexcept
Returns the number of elements.
Definition: interleaved_bloom_filter.hpp:692
auto begin() noexcept
Returns an iterator to the first element of the container.
Definition: interleaved_bloom_filter.hpp:701
auto begin() const noexcept
Returns an iterator to the first element of the container.
Definition: interleaved_bloom_filter.hpp:707
binning_bitvector & operator=(binning_bitvector const &)=default
Defaulted.
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: interleaved_bloom_filter.hpp:771
auto operator[](size_t const i) noexcept
Return the i-th element.
Definition: interleaved_bloom_filter.hpp:745
binning_bitvector(binning_bitvector const &)=default
Defaulted.
friend bool operator==(binning_bitvector const &lhs, binning_bitvector const &rhs) noexcept
Test for equality.
Definition: interleaved_bloom_filter.hpp:729
friend bool operator!=(binning_bitvector const &lhs, binning_bitvector const &rhs) noexcept
Test for inequality.
Definition: interleaved_bloom_filter.hpp:735
Manages membership queries for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:572
membership_agent & operator=(membership_agent const &)=default
Defaulted.
membership_agent(membership_agent const &)=default
Defaulted.
membership_agent & operator=(membership_agent &&)=default
Defaulted.
membership_agent(membership_agent &&)=default
Defaulted.
binning_bitvector const & bulk_contains(size_t const value) &noexcept
Determines set membership of a given value.
Definition: interleaved_bloom_filter.hpp:625
binning_bitvector result_buffer
Stores the result of bulk_contains().
Definition: interleaved_bloom_filter.hpp:603
The IBF binning directory. A data structure that efficiently answers set-membership queries for multi...
Definition: interleaved_bloom_filter.hpp:134
interleaved_bloom_filter(seqan3::bin_count bins_, seqan3::bin_size size, seqan3::hash_function_count funs=seqan3::hash_function_count{2u})
Construct an uncompressed Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:221
interleaved_bloom_filter & operator=(interleaved_bloom_filter const &)=default
Defaulted.
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:460
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: interleaved_bloom_filter.hpp:533
interleaved_bloom_filter()=default
Defaulted.
counting_agent_type< value_t > counting_agent() const
Returns a seqan3::interleaved_bloom_filter::counting_agent_type to be used for counting.
Definition: interleaved_bloom_filter.hpp:448
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: interleaved_bloom_filter.hpp:527
membership_agent membership_agent() const
Returns a seqan3::interleaved_bloom_filter::membership_agent to be used for lookup.
Definition: interleaved_bloom_filter.hpp:431
void emplace(size_t const value, bin_index const bin) noexcept
Inserts a value into a specific bin.
Definition: interleaved_bloom_filter.hpp:283
interleaved_bloom_filter & operator=(interleaved_bloom_filter &&)=default
Defaulted.
friend bool operator!=(interleaved_bloom_filter const &lhs, interleaved_bloom_filter const &rhs) noexcept
Test for inequality.
Definition: interleaved_bloom_filter.hpp:511
interleaved_bloom_filter(interleaved_bloom_filter &&)=default
Defaulted.
void clear(bin_index const bin) noexcept
Clears a specific bin.
Definition: interleaved_bloom_filter.hpp:309
size_t bin_count() const noexcept
Returns the number of bins that the Interleaved Bloom Filter manages.
Definition: interleaved_bloom_filter.hpp:468
size_t bin_size() const noexcept
Returns the size of a single bin that the Interleaved Bloom Filter manages.
Definition: interleaved_bloom_filter.hpp:476
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition: interleaved_bloom_filter.hpp:484
interleaved_bloom_filter(interleaved_bloom_filter const &)=default
Defaulted.
~interleaved_bloom_filter()=default
Defaulted.
void clear(rng_t &&bin_range) noexcept
Clears a range of bins.
Definition: interleaved_bloom_filter.hpp:336
friend bool operator==(interleaved_bloom_filter const &lhs, interleaved_bloom_filter const &rhs) noexcept
Test for equality.
Definition: interleaved_bloom_filter.hpp:498
static constexpr data_layout data_layout_mode
Indicates whether the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:192
void increase_bin_number_to(bin_count const new_bins_)
Increases the number of bins stored in the Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:374
interleaved_bloom_filter(interleaved_bloom_filter< data_layout::uncompressed > const &ibf)
Construct a compressed Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:256
T data(T... args)
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:28
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition: interleaved_bloom_filter.hpp:29
@ compressed
The Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:30
constexpr int countl_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the most significant bit ("...
Definition: bit:210
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:228
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
T memcpy(T... args)
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
SeqAn specific customisations in the standard namespace.
Provides basic data structure for strong types.
A strong type that represents the number of bins for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:36
A strong type that represents the bin index for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:57
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition: interleaved_bloom_filter.hpp:43
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:50
strong_type for seed.
Definition: minimiser_hash.hpp:25
T tie(T... args)
T transform(T... args)