SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
interleaved_bloom_filter.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <sdsl/bit_vectors.hpp>
16 
20 #include <seqan3/std/algorithm>
21 
22 namespace seqan3
23 {
24 
28 enum data_layout : bool
30 {
32  compressed
33 };
34 
36 struct bin_count : public detail::strong_type<size_t, bin_count, detail::strong_type_skill::convert>
37 {
38  using detail::strong_type<size_t, bin_count, detail::strong_type_skill::convert>::strong_type;
39 };
40 
42 struct bin_size : public detail::strong_type<size_t, bin_size, detail::strong_type_skill::convert>
43 {
44  using detail::strong_type<size_t, bin_size, detail::strong_type_skill::convert>::strong_type;
45 };
46 
48 struct hash_function_count : public detail::strong_type<size_t, hash_function_count, detail::strong_type_skill::convert>
49 {
50  using detail::strong_type<size_t, hash_function_count, detail::strong_type_skill::convert>::strong_type;
51 };
52 
54 struct bin_index : public detail::strong_type<size_t, bin_index, detail::strong_type_skill::convert>
55 {
56  using detail::strong_type<size_t, bin_index, detail::strong_type_skill::convert>::strong_type;
57 };
58 
125 template <data_layout data_layout_mode_ = data_layout::uncompressed>
127 {
128 private:
130  template <data_layout data_layout_mode>
131  friend class interleaved_bloom_filter;
132 
133  class membership_agent;
135 
137  using data_type = std::conditional_t<data_layout_mode_ == data_layout::uncompressed,
138  sdsl::bit_vector,
139  sdsl::sd_vector<>>;
140 
142  size_t bins{};
144  size_t technical_bins{};
146  size_t bin_size_{};
148  size_t hash_shift{};
150  size_t bin_words{};
152  size_t hash_funs{};
154  data_type data{};
156  static constexpr std::array<size_t, 5> hash_seeds{13572355802537770549ULL, // 2**64 / (e/2)
157  13043817825332782213ULL, // 2**64 / sqrt(2)
158  10650232656628343401ULL, // 2**64 / sqrt(3)
159  16499269484942379435ULL, // 2**64 / (sqrt(5)/2)
160  4893150838803335377ULL}; // 2**64 / (3*pi/5)
161 
169  inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
170  {
171  h *= seed;
172  assert(hash_shift < 64);
173  h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
174  h *= 11400714819323198485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
175  // Use fastrange (integer modulo without division) if possible.
176 #ifdef __SIZEOF_INT128__
177  h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(bin_size_)) >> 64);
178 #else
179  h %= bin_size_;
180 #endif
181  h *= technical_bins;
182  return h;
183  }
184 
185 public:
187  static constexpr data_layout data_layout_mode = data_layout_mode_;
188 
198 
218  {
219  bins = bins_.get();
220  bin_size_ = size.get();
221  hash_funs = funs.get();
222 
223  if (bins == 0)
224  throw std::logic_error{"The number of bins must be > 0."};
225  if (hash_funs == 0 || hash_funs > 5)
226  throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
227  if (bin_size_ == 0)
228  throw std::logic_error{"The size of a bin must be > 0."};
229 
230  hash_shift = detail::count_leading_zeros(bin_size_);
231  bin_words = (bins + 63) >> 6; // = ceil(bins/64)
232  technical_bins = bin_words << 6; // = bin_words * 64
233  data = sdsl::bit_vector(technical_bins * bin_size_);
234  }
235 
251  {
252  std::tie(bins, technical_bins, bin_size_, hash_shift, bin_words, hash_funs) =
253  std::tie(ibf.bins, ibf.technical_bins, ibf.bin_size_, ibf.hash_shift, ibf.bin_words, ibf.hash_funs);
254 
255  data = sdsl::sd_vector<>{ibf.data};
256  }
258 
274  void emplace(size_t const value, bin_index const bin)
278  {
279  assert(bin.get() < bins);
280  for (size_t i = 0; i < hash_funs; ++i)
281  {
282  size_t idx = hash_and_fit(value, hash_seeds[i]);
283  idx += bin.get();
284  assert(idx < data.size());
285  data[idx] = 1;
286  };
287  }
288 
312  void increase_bin_number_to(bin_count const new_bins_)
316  {
317  size_t new_bins = new_bins_.get();
318 
319  if (new_bins < bins)
320  throw std::invalid_argument{"The number of new bins must be >= the current number of bins."};
321 
322  // Equivalent to ceil(new_bins / 64)
323  size_t new_bin_words = (new_bins + 63) >> 6;
324 
325  bins = new_bins;
326 
327  if (new_bin_words == bin_words) // No need for internal resize if bin_words does not change.
328  return;
329 
330  size_t new_technical_bins = new_bin_words << 6;
331  size_t new_bits = bin_size_ * new_technical_bins;
332 
333  size_t idx_{new_bits}, idx{data.size()};
334  size_t delta = new_technical_bins - technical_bins + 64;
335 
336  data.resize(new_bits);
337 
338  for (size_t i = idx_, j = idx; j > 0; i -= new_technical_bins, j -= technical_bins)
339  {
340  size_t stop = i - new_technical_bins;
341 
342  for (size_t ii = i - delta, jj = j - 64; stop && ii >= stop; ii -= 64, jj -= 64)
343  {
344  uint64_t old = data.get_int(jj);
345  data.set_int(jj, 0);
346  data.set_int(ii, old);
347  }
348  }
349 
350  bin_words = new_bin_words;
351  technical_bins = new_technical_bins;
352  }
354 
370  {
372  }
374 
381  size_t hash_function_count() const noexcept
382  {
383  return hash_funs;
384  }
385 
389  size_t bin_count() const noexcept
390  {
391  return bins;
392  }
393 
397  size_t bin_size() const noexcept
398  {
399  return bin_size_;
400  }
401 
405  size_t bit_size() const noexcept
406  {
407  return data.size();
408  }
410 
419  friend bool operator==(interleaved_bloom_filter const & lhs, interleaved_bloom_filter const & rhs) noexcept
420  {
421  return std::tie(lhs.bins, lhs.technical_bins, lhs.bin_size_, lhs.hash_shift, lhs.bin_words, lhs.hash_funs,
422  lhs.data) ==
423  std::tie(rhs.bins, rhs.technical_bins, rhs.bin_size_, rhs.hash_shift, rhs.bin_words, rhs.hash_funs,
424  rhs.data);
425  }
426 
432  friend bool operator!=(interleaved_bloom_filter const & lhs, interleaved_bloom_filter const & rhs) noexcept
433  {
434  return !(lhs == rhs);
435  }
437 
445  template <cereal_archive archive_t>
446  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
447  {
448  archive(bins);
449  archive(technical_bins);
450  archive(bin_size_);
451  archive(hash_shift);
452  archive(bin_words);
453  archive(hash_funs);
454  archive(data);
455  }
457 };
458 
469 template <data_layout data_layout_mode>
471 {
472 private:
475 
477  ibf_t const * ibf_ptr{nullptr};
478 
479 public:
480  class binning_bitvector;
481 
485  membership_agent() = default;
486  membership_agent(membership_agent const &) = default;
490  ~membership_agent() = default;
491 
496  membership_agent(ibf_t const & ibf) : ibf_ptr(std::addressof(ibf))
497  {
498  result_buffer.resize(ibf_ptr->bin_count());
499  };
501 
504 
525  [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) & noexcept
526  {
527  assert(ibf_ptr != nullptr);
528  assert(result_buffer.size() == ibf_ptr->bin_count());
529 
530  std::array<size_t, 5> bloom_filter_indices;
531  std::memcpy(&bloom_filter_indices, &ibf_ptr->hash_seeds, sizeof(size_t) * ibf_ptr->hash_funs);
532 
533  for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
534  bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, bloom_filter_indices[i]);
535 
536  for (size_t batch = 0; batch < ibf_ptr->bin_words; ++batch)
537  {
538  size_t tmp{-1ULL};
539  for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
540  {
541  assert(bloom_filter_indices[i] < ibf_ptr->data.size());
542  tmp &= ibf_ptr->data.get_int(bloom_filter_indices[i]);
543  bloom_filter_indices[i] += 64;
544  }
545 
546  result_buffer.set_int(batch << 6, tmp);
547  }
548 
549  return result_buffer;
550  }
551 
552  // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
553  // is immediately destroyed.
554  [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) && noexcept = delete;
556 
557 };
558 
560 template <data_layout data_layout_mode>
561 class interleaved_bloom_filter<data_layout_mode>::membership_agent::binning_bitvector : private sdsl::bit_vector
562 {
563 public:
567  binning_bitvector() = default;
572  ~binning_bitvector() = default;
573 
575  using sdsl::bit_vector::begin;
576  using sdsl::bit_vector::end;
577  using sdsl::bit_vector::operator==;
578  using sdsl::bit_vector::operator[];
580 
581 #if SEQAN3_DOXYGEN_ONLY(1)0
582  using iterator_t = IMPLEMENTATION_DEFINED;
585  using reference_t = IMPLEMENTATION_DEFINED;
587  using const_reference_t = IMPLEMENTATION_DEFINED;
589  iterator_t begin() noexcept;
591  iterator_t end() noexcept;
593  bool operator==(bit_vector const & other) const noexcept;
595  reference_t operator[](size_t const & idx) noexcept;
597  const_reference_t operator[](size_t const & idx) const noexcept;
599  size_t size() noexcept;
600 #endif
601 
602 private:
603  friend class membership_agent;
604  using sdsl::bit_vector::resize;
605  using sdsl::bit_vector::set_int;
606  template <std::integral value_t>
607  friend class counting_vector;
608 };
609 
632 template <std::integral value_t>
633 class counting_vector : public std::vector<value_t>
634 {
635 private:
638 public:
642  counting_vector() = default;
643  counting_vector(counting_vector const &) = default;
647  ~counting_vector() = default;
648 
649  using base_t::base_t;
651 
664  template <typename rhs_t>
668  counting_vector & operator+=(rhs_t const & rhs)
669  {
670  assert(this->size() >= rhs.size()); // The counting vector may be bigger than what we need.
671 
672  // Each iteration can handle 64 bits, so we need to iterate `((rhs.size() + 63) >> 6` many times
673  for (size_t batch = 0, bin = 0; batch < ((rhs.size() + 63) >> 6); bin = 64 * ++batch)
674  {
675  size_t tmp = rhs.get_int(batch * 64); // get 64 bits starting at position `batch * 64`
676  if (tmp ^ (1ULL<<63)) // This is a special case, because we would shift by 64 (UB) in the while loop.
677  {
678  while (tmp > 0)
679  {
680  // Jump to the next 1 and increment the corresponding vector entry.
681  uint8_t step = detail::count_trailing_zeros(tmp);
682  bin += step++;
683  tmp >>= step;
684  ++(*this)[bin++];
685  }
686  }
687  else
688  {
689  ++(*this)[bin + 63];
690  }
691  }
692  return *this;
693  }
694 
695 };
696 
698 
699 } // namespace seqan3
seqan3::interleaved_bloom_filter::membership_agent
membership_agent membership_agent() const
Returns seqan3::interleaved_bloom_filter::membership_agent to be used for lookup.
Definition: interleaved_bloom_filter.hpp:369
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::operator=
binning_bitvector & operator=(binning_bitvector const &)=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::const_reference_t
IMPLEMENTATION_DEFINED const_reference_t
The const_reference type of the binning_bitvector;.
Definition: interleaved_bloom_filter.hpp:587
seqan3::data_layout
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:30
seqan3::interleaved_bloom_filter::membership_agent::result_buffer
binning_bitvector result_buffer
Stores the result of bulk_contains().
Definition: interleaved_bloom_filter.hpp:499
bit_manipulation.hpp
Provides utility functions for bit twiddling.
seqan3::interleaved_bloom_filter::membership_agent::operator=
membership_agent & operator=(membership_agent &&)=default
Defaulted.
seqan3::interleaved_bloom_filter::hash_function_count
size_t hash_function_count() const noexcept
Returns the number of hash functions used in the Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:381
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::operator=
binning_bitvector & operator=(binning_bitvector &&)=default
Defaulted.
seqan3::interleaved_bloom_filter::operator=
interleaved_bloom_filter & operator=(interleaved_bloom_filter &&)=default
Defaulted.
seqan3::compressed
@ compressed
The Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:32
seqan3::counting_vector
A data structure that behaves like a std::vector and can be used to consolidate the results of multip...
Definition: interleaved_bloom_filter.hpp:634
std::vector
std::vector< value_t >::size
T size(T... args)
seqan3::hash_function_count
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:49
strong_type.hpp
Provides basic data structure for strong types.
seqan3::interleaved_bloom_filter
The IBF binning directory. A data structure that efficiently answers set-membership queries for multi...
Definition: interleaved_bloom_filter.hpp:127
seqan3::interleaved_bloom_filter::membership_agent::membership_agent
membership_agent(membership_agent const &)=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::binning_bitvector
binning_bitvector()=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::membership_agent
membership_agent(membership_agent &&)=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent
Manages membership queries for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:471
seqan3::counting_vector::counting_vector
counting_vector(counting_vector const &)=default
Defaulted.
seqan3::seed
strong_type for seed.
Definition: minimiser_hash.hpp:24
seqan3::interleaved_bloom_filter::interleaved_bloom_filter
interleaved_bloom_filter(interleaved_bloom_filter &&)=default
Defaulted.
algorithm
Adaptations of algorithms from the Ranges TS.
seqan3::counting_vector::operator+=
counting_vector & operator+=(rhs_t const &rhs)
Bin-wise adds the bits of a seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector.
Definition: interleaved_bloom_filter.hpp:668
seqan3::interleaved_bloom_filter::interleaved_bloom_filter
interleaved_bloom_filter(interleaved_bloom_filter< data_layout::uncompressed > const &ibf)
Construct a compressed Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:247
seqan3::interleaved_bloom_filter::~interleaved_bloom_filter
~interleaved_bloom_filter()=default
Defaulted.
std::tie
T tie(T... args)
seqan3::counting_vector::operator=
counting_vector & operator=(counting_vector const &)=default
Defaulted.
seqan3::counting_vector::operator=
counting_vector & operator=(counting_vector &&)=default
Defaulted.
seqan3::counting_vector::~counting_vector
~counting_vector()=default
Defaulted.
seqan3::bin_index
A strong type that represents the bin index for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:55
seqan3::interleaved_bloom_filter::bit_size
size_t bit_size() const noexcept
Returns the size of the underlying bitvector.
Definition: interleaved_bloom_filter.hpp:405
seqan3::interleaved_bloom_filter::operator!=
friend bool operator!=(interleaved_bloom_filter const &lhs, interleaved_bloom_filter const &rhs) noexcept
Test for inequality.
Definition: interleaved_bloom_filter.hpp:432
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::begin
iterator_t begin() noexcept
Returns an iterator to the begin of the bitvector.
std::array< size_t, 5 >
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
seqan3::bin_count
A strong type that represents the number of bins for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:37
std::logic_error
std::invalid_argument
seqan3::bin_size
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition: interleaved_bloom_filter.hpp:43
seqan3::interleaved_bloom_filter::bin_count
size_t bin_count() const noexcept
Returns the number of bins that the Interleaved Bloom Filter manages.
Definition: interleaved_bloom_filter.hpp:389
seqan3::interleaved_bloom_filter::bin_size
size_t bin_size() const noexcept
Returns the size of a single bin that the Interleaved Bloom Filter manages.
Definition: interleaved_bloom_filter.hpp:397
seqan3::interleaved_bloom_filter::operator=
interleaved_bloom_filter & operator=(interleaved_bloom_filter const &)=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::size
size_t size() noexcept
Returns the size of the bitvector.
seqan3::pack_traits::size
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
seqan3::interleaved_bloom_filter::increase_bin_number_to
void increase_bin_number_to(bin_count const new_bins_)
Increases the number of bins stored in the Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:312
seqan3::interleaved_bloom_filter::membership_agent::membership_agent
membership_agent()=default
Defaulted.
std
SeqAn specific customisations in the standard namespace.
seqan3::interleaved_bloom_filter::interleaved_bloom_filter
interleaved_bloom_filter()=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector
A bitvector representing the result of a call to bulk_contains of the seqan3::interleaved_bloom_filte...
Definition: interleaved_bloom_filter.hpp:562
seqan3::interleaved_bloom_filter::interleaved_bloom_filter
interleaved_bloom_filter(interleaved_bloom_filter const &)=default
Defaulted.
seqan3::counting_vector::counting_vector
counting_vector()=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::reference_t
IMPLEMENTATION_DEFINED reference_t
The reference type of the binning_bitvector;.
Definition: interleaved_bloom_filter.hpp:585
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::binning_bitvector
binning_bitvector(binning_bitvector &&)=default
Defaulted.
seqan3::interleaved_bloom_filter::operator==
friend bool operator==(interleaved_bloom_filter const &lhs, interleaved_bloom_filter const &rhs) noexcept
Test for equality.
Definition: interleaved_bloom_filter.hpp:419
cereal.hpp
Adaptions of concepts from the Cereal library.
std::memcpy
T memcpy(T... args)
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::~binning_bitvector
~binning_bitvector()=default
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::binning_bitvector
binning_bitvector(binning_bitvector const &)=default
Defaulted.
seqan3::interleaved_bloom_filter::membership_agent::~membership_agent
~membership_agent()=default
Defaulted.
std::conditional_t
seqan3::interleaved_bloom_filter::membership_agent::binning_bitvector::iterator_t
IMPLEMENTATION_DEFINED iterator_t
The iterator type of the binning_bitvector;.
Definition: interleaved_bloom_filter.hpp:583
seqan3::interleaved_bloom_filter::interleaved_bloom_filter
interleaved_bloom_filter(seqan3::bin_count bins_, seqan3::bin_size size, seqan3::hash_function_count funs=seqan3::hash_function_count{2u})
Construct an uncompressed Interleaved Bloom Filter.
Definition: interleaved_bloom_filter.hpp:212
seqan3::uncompressed
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition: interleaved_bloom_filter.hpp:31
seqan3::interleaved_bloom_filter::data_layout_mode
static constexpr data_layout data_layout_mode
Indicates whether the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:187
seqan3::interleaved_bloom_filter::membership_agent::operator=
membership_agent & operator=(membership_agent const &)=default
Defaulted.
seqan3::interleaved_bloom_filter::emplace
void emplace(size_t const value, bin_index const bin)
Inserts a value into a specific bin.
Definition: interleaved_bloom_filter.hpp:274
seqan3::interleaved_bloom_filter::membership_agent::bulk_contains
binning_bitvector const & bulk_contains(size_t const value) &noexcept
Determines set membership of a given value.
Definition: interleaved_bloom_filter.hpp:525
std::array::data
T data(T... args)
seqan3::counting_vector::counting_vector
counting_vector(counting_vector &&)=default
Defaulted.
std::is_base_of