HIBF 1.0.0-rc.1
All Classes Namespaces Files Functions Variables Typedefs Friends Macros Modules Pages Concepts
index

The HIBF Cookbook

This document provides example recipes on how to carry out particular tasks using the HIBF functionalities in C++. Please note that these recipes are not ordered. You can use the links in the table of contents or the search function of your browser to navigate them.

It will take some time, but we hope to expand this document to contain numerous great examples. If you have suggestions for how to improve the Cookbook and/or examples you would like included, please feel free to contact us.

All HIBF documentation snippets

The following lists all snippets that appear in our documentation. Search for keywords with Ctrl + F (Strg + F on German keyboards).

// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <functional> // for function
#include <hibf/config.hpp> // for config, insert_iterator
// Smallest possible configuration: only the input callback is set.
int main()
{
    // The callback's parameter list (user bin id, insert iterator) is fixed.
    seqan::hibf::config config{.input_fn = [&](size_t const /* user_bin_id */, seqan::hibf::insert_iterator it)
                               {
                                   // Any value convertible to uint64_t can be assigned to the iterator.
                                   it = 42;
                               }};
}
Definition insert_iterator.hpp:25
The configuration used to build an (H)IBF.
Definition config.hpp:75
std::function< void(size_t const, insert_iterator &&)> input_fn
A function for how to hash your input [REQUIRED].
Definition config.hpp:110
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <cstdint> // for uint64_t
#include <filesystem> // for path
#include <fstream> // for basic_ifstream, getline, ifstream
#include <functional> // for function
#include <string> // for basic_string, string
#include <string_view> // for basic_string_view
#include <vector> // for vector
#include <hibf/test/temporary_snippet_file.hpp> // for temporary_snippet_file
seqan::hibf::test::temporary_snippet_file file1{"file1.fa", "ACGT"};
seqan::hibf::test::temporary_snippet_file file2{"file2.fa", "ACGT"};
#include <hibf/config.hpp> // for insert_iterator, config
int main()
{
std::vector<std::filesystem::path> filenames{file1.path(), file2.path()};
auto my_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
std::ifstream infile{filenames[user_bin_id]};
std::getline(infile, line); // assume there is a sequence in the first line
// Look at https://docs.seqan.de/seqan3/3-master-user/group__io__sequence__file.html for e.g. FASTA File I/O
for (size_t i = 0; i < line.size() - 1; ++i)
{
// compute 2-mer hashes based on the character value
uint64_t hash = 4 * line[i] + line[i + 1];
// You can also look at the seqan3::kmer_hash view for hashing
// https://docs.seqan.de/seqan3/3-master-user/group__search__views.html#ga6e598d6a021868f704d39df73252974f
it = hash;
}
};
seqan::hibf::config config{.input_fn = my_input};
}
T getline(T... args)
T size(T... args)
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <cstdint> // for uint64_t
#include <functional> // for function
#include <vector> // for vector
#include <hibf/config.hpp> // for insert_iterator, config
// Minimal nucleotide type; `rank` encodes the base: A = 0, C = 1, G = 2, T = 3.
struct dna
{
    int rank = 0;
};
// Example: an input function that hashes in-memory dna sequences (one per user bin) into 2-mers.
int main()
{
    // user_bins stores one dna sequence per user bin
    // You can look at https://docs.seqan.de/seqan3/3-master-user/group__alphabet__nucleotide.html for dna alphabets
    std::vector<std::vector<dna>> user_bins{{{0}, {0}, {0} /*AAA*/}, {{1}, {1}, {1} /*CCC*/}};
    auto my_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
    {
        auto const & seq = user_bins[user_bin_id];
        // `i + 1 < size()` instead of `i < size() - 1`: the latter wraps around for an empty sequence.
        for (size_t i = 0; i + 1 < seq.size(); ++i)
        {
            // compute 2-mer hashes
            uint64_t hash = 4 * seq[i].rank + seq[i + 1].rank;
            // You can also look at the seqan3::kmer_hash view for hashing
            // https://docs.seqan.de/seqan3/3-master-user/group__search__views.html#ga6e598d6a021868f704d39df73252974f
            it = hash;
        }
    };
    seqan::hibf::config config{.input_fn = my_input};
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <functional> // for function
#include <hibf/config.hpp> // for config, insert_iterator
// Configuration that sets the input callback together with the number of user bins.
int main()
{
    // The callback's parameter list (user bin id, insert iterator) is fixed.
    auto constant_input = [&](size_t const /* user_bin_id */, seqan::hibf::insert_iterator it)
    {
        // Any value convertible to uint64_t can be assigned to the iterator.
        it = 42;
    };

    seqan::hibf::config config{.input_fn = constant_input, //
                               .number_of_user_bins = 12};
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <functional> // for function
#include <vector> // for vector
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/hierarchical_interleaved_bloom_filter.hpp> // for hierarchical_interleaved_bloom_filter
// Example: a config with every option spelled out, followed by the HIBF construction.
int main()
{
    // 2 user bins:
    std::vector<std::vector<size_t>> hashes{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u}, {1u, 2u, 3u, 4u, 5u}};
    // input just passes hashes:
    auto my_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
    {
        for (auto const hash : hashes[user_bin_id])
            it = hash;
    };
    seqan::hibf::config config{.input_fn = my_input,      // required
                               .number_of_user_bins = 2, // required
                               .number_of_hash_functions = 2,
                               .maximum_fpr = 0.05, // recommended to adapt
                               .threads = 1,        // recommended to adapt
                               .sketch_bits = 12,
                               .tmax = 0, // triggers default computation
                               .alpha = 1.2,
                               .max_rearrangement_ratio = 0.5,
                               .disable_estimate_union = false,
                               .disable_rearrangement = false};
    // construct the HIBF
    seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};
}
The Hierarchical Interleaved Bloom Filter (HIBF) - Fast answers to set-membership queries for multipl...
Definition hierarchical_interleaved_bloom_filter.hpp:140
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <functional> // for function
#include <vector> // for vector
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/hierarchical_interleaved_bloom_filter.hpp> // for hierarchical_interleaved_bloom_filter
#include <hibf/misc/print.hpp> // for print, print_t
// Example: build an HIBF over two user bins and query it with a membership agent.
int main()
{
    // 2 user bins:
    std::vector<std::vector<size_t>> hashes{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u}, {1u, 2u, 3u, 4u, 5u}};
    // input just passes hashes:
    auto my_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
    {
        for (auto const hash : hashes[user_bin_id])
            it = hash;
    };
    seqan::hibf::config config{.input_fn = my_input, .number_of_user_bins = 2};
    // construct the HIBF
    seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};
    // query the HIBF
    std::vector<size_t> query{1u, 2u, 3u};
    std::vector<size_t> query2{8u, 9u, 10u};
    auto agent = hibf.membership_agent();              // you need an agent for efficient queries
    auto & result = agent.membership_for(query, 2u);   // both user bins have hashes 1,2,3
    seqan::hibf::print(result);                        // [1,0]
    agent.sort_results();                              // Results can also be sorted
    seqan::hibf::print(result);                        // [0,1]
    auto & result2 = agent.membership_for(query2, 2u); // only user bin 0 has hashes 8,9,10
    seqan::hibf::print(result2);                       // [0]
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_index, bin_count, bin_size
#include <hibf/misc/print.hpp> // for print, print_t
// Example: insert values into an IBF and query a single value with a containment agent.
int main()
{
    // 12 bins (matches the 12-entry result printed below); the bin size of 8192 bits is an example value.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{12u}, seqan::hibf::bin_size{8192u}};
    ibf.emplace(126, seqan::hibf::bin_index{0u});
    ibf.emplace(712, seqan::hibf::bin_index{3u});
    ibf.emplace(237, seqan::hibf::bin_index{9u});
    // Query the Interleaved Bloom Filter. Note that there may be false positive results!
    // A `1` at position `i` indicates the (probable) presence of the query in bin `i`.
    // Capture the result by reference to avoid copies.
    auto agent = ibf.containment_agent();
    auto & result = agent.bulk_contains(712);
    seqan::hibf::print(result); // [0,0,0,1,0,0,0,0,0,0,0,0]
    // Calling `increase_bin_number_to` invalidates the agent.
    ibf.increase_bin_number_to(seqan::hibf::bin_count{60u});
    // So make sure to construct a new containment_agent.
    agent = ibf.containment_agent();
}
The IBF binning directory. A data structure that efficiently answers set-membership queries for multi...
Definition interleaved_bloom_filter.hpp:142
Provides seqan::hibf::interleaved_bloom_filter.
A strong type that represents the number of bins for the seqan::hibf::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:47
A strong type that represents the bin index for the seqan::hibf::interleaved_bloom_filter.
Definition interleaved_bloom_filter.hpp:74
A strong type that represents the number of bits for each bin in the seqan::hibf::interleaved_bloom_f...
Definition interleaved_bloom_filter.hpp:56
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
// Example: obtaining a containment_agent and re-creating it after the IBF was resized.
int main()
{
    // Construct an Interleaved Bloom Filter to be used with the containment_agent.
    // (43 bins, 8192 bits per bin, 3 hash functions - example values.)
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{43u},
                                              seqan::hibf::bin_size{8192u},
                                              seqan::hibf::hash_function_count{3u}};
    // The containment_agent can now be constructed by calling `containment_agent` on the Interleaved Bloom Filter.
    auto agent = ibf.containment_agent();
    // Calling `increase_bin_number_to` invalidates the agent.
    ibf.increase_bin_number_to(seqan::hibf::bin_count{60u});
    // So make sure to construct a new containment_agent.
    agent = ibf.containment_agent();
}
A strong type that represents the number of hash functions for the seqan::hibf::interleaved_bloom_fil...
Definition interleaved_bloom_filter.hpp:65
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <cstdint> // for uint8_t
#include <ranges> // for iota_view, __fn, iota, views
#include <vector> // for vector
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_index, bin_count, bin_size, hash_...
#include <hibf/misc/print.hpp> // for print, print_t
// Example: count, per bin, how many values of a query range are present in an IBF.
int main()
{
    // 8 bins (matches the 8-entry results printed below); bin size and hash function count are example values.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{8u},
                                              seqan::hibf::bin_size{8192u},
                                              seqan::hibf::hash_function_count{2u}};
    auto sequence1 = std::views::iota(0u, 20u);
    auto sequence2 = std::views::iota(10u, 30u);
    auto sequence3 = std::views::iota(25u, 35u);
    // Insert all values of sequence1 into bin 0
    for (auto && value : sequence1)
        ibf.emplace(value, seqan::hibf::bin_index{0u});
    // Insert all values of sequence2 into bin 4
    for (auto && value : sequence2)
        ibf.emplace(value, seqan::hibf::bin_index{4u});
    // Insert all values of sequence3 into bin 7
    for (auto && value : sequence3)
        ibf.emplace(value, seqan::hibf::bin_index{7u});
    auto agent = ibf.counting_agent();
    // Count all values of sequence1 for all bins
    auto & result = agent.bulk_count(sequence1); // Bind by `&` to avoid copies!
    seqan::hibf::print(result);                  // [20,0,0,0,10,0,0,0]
    // Search for specific values
    std::vector<size_t> const values{92, 1238, 812, 81273};
    seqan::hibf::print(agent.bulk_count(values));                      // [0,0,0,0,0,0,0,0]
    seqan::hibf::print(agent.bulk_count(std::views::iota(0u, 1024u))); // [20,0,0,0,20,0,0,10]
    // The default counters are 16 bit unsigned integer.
    // An optional template parameter can be used to specify the counter type
    auto agent2 = ibf.counting_agent<uint8_t>();
    // The returned counts are now 8 bit unsigned integers.
    seqan::hibf::print(agent2.bulk_count(sequence1)); // [20,0,0,0,10,0,0,0]
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
// Example: obtaining a counting_agent and re-creating it after the IBF was resized.
int main()
{
    // Construct an Interleaved Bloom Filter to be used with the counting_agent.
    // (43 bins, 8192 bits per bin, 3 hash functions - example values.)
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{43u},
                                              seqan::hibf::bin_size{8192u},
                                              seqan::hibf::hash_function_count{3u}};
    // The counting_agent can now be constructed by calling `counting_agent` on the Interleaved Bloom Filter.
    auto agent = ibf.counting_agent();
    // Calling `increase_bin_number_to` invalidates the agent.
    ibf.increase_bin_number_to(seqan::hibf::bin_count{60u});
    // So make sure to construct a new counting_agent.
    agent = ibf.counting_agent();
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstdint> // for uint16_t
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_index, bin_count, bin_size
#include <hibf/misc/counting_vector.hpp> // for counting_vector
#include <hibf/misc/print.hpp> // for print, print_t
int main()
{
ibf.emplace(126, seqan::hibf::bin_index{0u});
ibf.emplace(126, seqan::hibf::bin_index{3u});
ibf.emplace(126, seqan::hibf::bin_index{9u});
ibf.emplace(712, seqan::hibf::bin_index{3u});
ibf.emplace(237, seqan::hibf::bin_index{9u});
// The counting_vector must be at least as big as the number of bins.
auto agent = ibf.containment_agent();
counts += agent.bulk_contains(712); // `counts` contains the number of occurrences of 712 in each bin.
seqan::hibf::print(counts); // prints [0,0,0,1,0,0,0,0,0,0,0,0]
counts += agent.bulk_contains(237); // `counts` contains the number of occurrences of 712 and 237 in each bin.
seqan::hibf::print(counts); // prints [0,0,0,1,0,0,0,0,0,1,0,0]
counts += agent.bulk_contains(126); // `counts` contains the number of occurrences of 712, 237 and 126 in each bin.
seqan::hibf::print(counts); // prints [1,0,0,2,0,0,0,0,0,2,0,0]
counts += counts; // multiple counts can also be added together
seqan::hibf::print(counts); // prints [2,0,0,4,0,0,0,0,0,4,0,0]
}
A data structure that behaves like a std::vector and can be used to consolidate the results of multip...
Definition counting_vector.hpp:146
Provides seqan::hibf::counting_vector.
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <ranges> // for iota_view, operator==, _Iota, iota, views
#include <vector> // for vector
#include <hibf/interleaved_bloom_filter.hpp> // for bin_index, interleaved_bloom_filter, bin_count, bin_size, hash_...
#include <hibf/misc/print.hpp> // for print, print_t
// Example: clearing individual bins of an IBF and observing the effect on counts.
int main()
{
    // 8 bins (matches the 8-entry results printed below); bin size and hash function count are example values.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{8u},
                                              seqan::hibf::bin_size{8192u},
                                              seqan::hibf::hash_function_count{2u}};
    auto sequence1 = std::views::iota(0u, 20u);
    auto sequence2 = std::views::iota(10u, 30u);
    auto sequence3 = std::views::iota(25u, 35u);
    // Insert all values of sequence1 into bin 0
    for (auto && value : sequence1)
        ibf.emplace(value, seqan::hibf::bin_index{0u});
    // Insert all values of sequence2 into bin 4
    for (auto && value : sequence2)
        ibf.emplace(value, seqan::hibf::bin_index{4u});
    // Insert all values of sequence3 into bin 7
    for (auto && value : sequence3)
        ibf.emplace(value, seqan::hibf::bin_index{7u});
    auto agent = ibf.counting_agent();
    // Count all values of sequence1 for all bins
    seqan::hibf::print(agent.bulk_count(sequence1)); // [20,0,0,0,10,0,0,0]
    // Clear bin 0
    ibf.clear(seqan::hibf::bin_index{0u});
    // After clearing, no values are found in bin 0
    seqan::hibf::print(agent.bulk_count(sequence1)); // [0,0,0,0,10,0,0,0]
    // Search for specific values
    seqan::hibf::print(agent.bulk_count(std::views::iota(0u, 1024u))); // [0,0,0,0,20,0,0,10]
    // Clear bin 4 and 7
    ibf.clear(seqan::hibf::bin_index{4u});
    ibf.clear(seqan::hibf::bin_index{7u});
    // After clearing, nothing is found
    seqan::hibf::print(agent.bulk_count(std::views::iota(0u, 1024u))); // [0,0,0,0,0,0,0,0]
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
// Example: the two ways of constructing an Interleaved Bloom Filter (explicit vs. default hash function count).
// NOTE(review): only the explanatory comments are present in this excerpt; the construction statements
// they describe appear to be missing - confirm against the original snippet.
int main()
{
// Construct an Interleaved Bloom Filter that contains 43 bins, each using 8192 bits, and 3 hash functions.
// Construct an Interleaved Bloom Filter that contains 43 bins, each using 256 KiBits,
// and the default of 2 hash functions.
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
// Example: inserting single values into specific bins of an IBF.
int main()
{
    // Bins 0, 3 and 9 are used below, so 12 bins suffice; the bin size of 8192 bits is an example value.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{12u}, seqan::hibf::bin_size{8192u}};
    // Insert the values `126`, `712` and `237` into bins `0`, `3` and `9` of the Interleaved Bloom Filter.
    ibf.emplace(126, seqan::hibf::bin_index{0u});
    ibf.emplace(712, seqan::hibf::bin_index{3u});
    ibf.emplace(237, seqan::hibf::bin_index{9u});
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_index, bin_count, bin_size
#include <hibf/misc/print.hpp> // for print, print_t
// Example: growing the number of bins of an IBF; existing bin contents are kept.
int main()
{
    // Start with fewer than 18 bins so that the resize below has an effect; sizes are example values.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{12u}, seqan::hibf::bin_size{8192u}};
    ibf.emplace(126, seqan::hibf::bin_index{0u});
    ibf.emplace(712, seqan::hibf::bin_index{3u});
    ibf.emplace(237, seqan::hibf::bin_index{9u});
    ibf.increase_bin_number_to(seqan::hibf::bin_count{18u});
    // Be sure to get the agent after `increase_bin_number_to` as it invalidates all agents!
    auto agent = ibf.containment_agent();
    // The content of the bins which were already present before the resize does not change
    seqan::hibf::print(agent.bulk_contains(126)); // [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    seqan::hibf::print(agent.bulk_contains(712)); // [0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
    seqan::hibf::print(agent.bulk_contains(237)); // [0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0]
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <iostream> // for basic_ostream, operator<<, cout, boolalpha, basic_ios
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_count, bin_index, bin_size
#include <hibf/misc/print.hpp> // for print, print_t
// Example: `try_increase_bin_number_to` only changes the bin count when no resize is needed.
int main()
{
    // 73 bins, so that the first call below requests the "same bin count"; the bin size is an example value.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{73u}, seqan::hibf::bin_size{8192u}};
    ibf.emplace(126, seqan::hibf::bin_index{0u});
    ibf.emplace(712, seqan::hibf::bin_index{3u});
    ibf.emplace(237, seqan::hibf::bin_index{9u});
    // Same bin count has no effect and returns `true`.
    bool result = ibf.try_increase_bin_number_to(seqan::hibf::bin_count{73u});
    std::cout << std::boolalpha << result << '\n'; // true
    std::cout << ibf.bin_count() << '\n';          // 73
    // Smaller bin count has no effect and returns `false`.
    result = ibf.try_increase_bin_number_to(seqan::hibf::bin_count{50u});
    std::cout << std::boolalpha << result << '\n'; // false
    std::cout << ibf.bin_count() << '\n';          // 73
    // Larger bin count and resize not required increases the bin count and returns `true`.
    result = ibf.try_increase_bin_number_to(seqan::hibf::bin_count{128u});
    std::cout << std::boolalpha << result << '\n'; // true
    std::cout << ibf.bin_count() << '\n';          // 128
    // Resize would be required, hence returns `false`.
    result = ibf.try_increase_bin_number_to(seqan::hibf::bin_count{129u});
    std::cout << std::boolalpha << result << '\n'; // false
    std::cout << ibf.bin_count() << '\n';          // 128
    // Be sure to get the agent after `try_increase_bin_number_to` as it may invalidate all agents!
    auto agent = ibf.containment_agent();
    // The content of the bins which were already present before the resize does not change
    seqan::hibf::print(agent.bulk_contains(126)); // [1,0,0,0,0,0,0,0,0,0,0,...,0]
    seqan::hibf::print(agent.bulk_contains(712)); // [0,0,0,1,0,0,0,0,0,0,0,...,0]
    seqan::hibf::print(agent.bulk_contains(237)); // [0,0,0,0,0,0,0,0,0,1,0,...,0]
}
T boolalpha(T... args)
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
// Example: obtaining a membership_agent and re-creating it after the IBF was resized.
int main()
{
    // Construct an Interleaved Bloom Filter to be used with the membership_agent.
    // (43 bins, 8192 bits per bin, 3 hash functions - example values.)
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{43u},
                                              seqan::hibf::bin_size{8192u},
                                              seqan::hibf::hash_function_count{3u}};
    // The membership_agent can now be constructed by calling `membership_agent` on the Interleaved Bloom Filter.
    auto agent = ibf.membership_agent();
    // Calling `increase_bin_number_to` invalidates the agent.
    ibf.increase_bin_number_to(seqan::hibf::bin_count{60u});
    // So make sure to construct a new membership_agent.
    agent = ibf.membership_agent();
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstdint> // for uint64_t
#include <vector> // for vector
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_index, bin_count, bin_size
#include <hibf/misc/print.hpp> // for print, print_t
// Example: find all bins of an IBF that contain at least a given number of query values.
int main()
{
    // Bins up to index 9 are used below, so 12 bins suffice; the bin size of 8192 bits is an example value.
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{12u}, seqan::hibf::bin_size{8192u}};
    std::vector<uint64_t> const query{126, 712, 237};
    for (auto && value : query)
    {
        ibf.emplace(value, seqan::hibf::bin_index{3u});
        ibf.emplace(value, seqan::hibf::bin_index{5u});
    }
    ibf.emplace(126, seqan::hibf::bin_index{7u});
    ibf.emplace(712, seqan::hibf::bin_index{7u});
    ibf.emplace(956, seqan::hibf::bin_index{9u});
    auto agent = ibf.membership_agent();
    // Returns all bin indices that contain at least 2 elements of the query.
    // Capture the result by reference to avoid copies.
    auto & result = agent.membership_for(query, 2u);
    seqan::hibf::print(result); // [3, 5, 7]
    // Calling `increase_bin_number_to` invalidates the agent.
    ibf.increase_bin_number_to(seqan::hibf::bin_count{60u});
    // So make sure to construct a new membership_agent.
    agent = ibf.membership_agent();
}
T emplace(T... args)
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cstddef> // for size_t
#include <cstdint> // for uint64_t
#include <functional> // for function
#include <iostream> // for basic_ostream, operator<<, cout
#include <ranges> // for __fn, iota, views
#include <vector> // for vector
#include <hibf/config.hpp> // for insert_iterator, config
#include <hibf/hierarchical_interleaved_bloom_filter.hpp> // for hierarchical_interleaved_bloom_filter
// Complete walkthrough: describe user bins, configure and build an HIBF, then run two membership queries.
int main()
{
    // Let's say we have groups that have data that we find interesting.
    // For example, each file of the RefSeq data set could be such a group.
    // In the context of the HIBF, we call such groups user bins.
    // Given a query, we want to quickly determine which user bins this query is likely to occur in.
    // This is also called Approximate Membership Query (AMQ).
    // In this example, we have three user bins. Each of these user bins is characterized by a range of
    // unsigned integer values. Some popular techniques for obtaining such unsigned integers from
    // biological sequences include k-mers, minimisers, and syncmers.
    // For clarity, we show each user bin individually before copying them to user_bin_data.
    std::vector<uint64_t> user_bin_1{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u};
    std::vector<uint64_t> user_bin_2{1u, 2u, 3u, 4u, 5u};
    std::vector<uint64_t> user_bin_3{3u, 9u, 11u};
    std::vector<std::vector<uint64_t>> user_bin_data{user_bin_1, user_bin_2, user_bin_3};
    // The HIBF uses a config. There are two required options:
    // 1) The number of user bins: 3 (user_bin_data.size())
    // 2) A function to access the input data.
    //    The signature is (size_t const user_bin_id, seqan::hibf::insert_iterator it). You need to
    //    provide the function body, and the hibf lib will use this function to access the data of each
    //    user bin. When this function is called by the library with a specific user_bin_id, all
    //    unsigned integer values (data) belonging to this user bin have to be assigned to the
    //    seqan::hibf::insert_iterator.
    //    Conveniently, this function can be a lambda, and hence capture data outside the function body.
    auto get_user_bin_data = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it)
    {
        for (auto value : user_bin_data[user_bin_id])
            it = value;
    };
    // Now we can construct a config, any other settings are optional. We have included some interesting
    // settings with their respective default values here.
    seqan::hibf::config config{.input_fn = get_user_bin_data, // required
                               .number_of_user_bins = 3u,     // required
                               .number_of_hash_functions = 2u,
                               .maximum_fpr = 0.05,
                               .threads = 1u};
    // The HIBF constructor will determine a hierarchical layout for the user bins and build the filter.
    seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config};
    // Now we can search for some query.
    std::vector<uint64_t> query1{3u, 9u, 12u, 14u};
    // For this, we use the membership agent of the HIBF. This agent only needs to be created once and
    // can be reused for multiple subsequent queries.
    // If you are using multiple threads in your app, each thread should have its own membership agent.
    auto agent = hibf.membership_agent();
    // The membership_for function takes the query and a threshold. Here, a threshold of two means that
    // at least (>=) 2 values of the query must be found within a user bin to be a hit.
    // While exact thresholds can be obtained for some approaches such as k-mers, another popular
    // approach is to require at least x% of the values in the query to hit.
    // For example, a threshold of 2 equals 50% of the values in query1 (4 values).
    // This threshold needs to be provided by the user. In general, some care should be taken with the
    // threshold. A low threshold requires a traversal of more parts of the hierarchy and slows down
    // the search.
    // Note that we bind the result with a `&` to avoid copies!
    auto & result1 = agent.membership_for(query1, 2u);
    // query1 hits in user_bin_1 and user_bin_3, which have the IDs 0 and 2, respectively.
    for (uint64_t hit_user_bin : result1)
        std::cout << hit_user_bin << ' '; // The results are not sorted: 2 0
    std::cout << '\n';
    // Another query.
    // A query is simply a range of unsigned integer values, e.g., it does not have to be a vector.
    auto query2 = std::views::iota(0u, 15u); // 0,1,2,...,14
    auto & result2 = agent.membership_for(query2, 5u);
    agent.sort_results(); // Sort the results.
    // query2 hits in user_bin_1 and user_bin_2, which have the IDs 0 and 1, respectively.
    for (uint64_t hit_user_bin : result2)
        std::cout << hit_user_bin << ' '; // The results are sorted: 0 1
    std::cout << '\n';
}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause
// Empty program; falling off the end of `main` is equivalent to `return 0;`.
int main()
{}
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <cassert> // for assert
#include <filesystem> // for remove
#include <fstream> // for char_traits, basic_ofstream, basic_ostream, operator<<, ofstream
#include <hibf/test/sandboxed_path.hpp> // for operator/, sandboxed_path
#include <hibf/test/tmp_directory.hpp> // for tmp_directory
// Example: use a tmp_directory as a scratch area and verify that nothing is left behind.
int main()
{
    // Create the temporary directory.
    seqan::hibf::test::tmp_directory tmp{};

    // Stand-in for a function that creates a temporary file and removes it again.
    {
        std::ofstream ofs{tmp.path() / "somefile.txt"};
        ofs << "Hello World!";
        ofs.close();

        auto const written_file = tmp.path() / "somefile.txt";
        std::filesystem::remove(written_file);
    }

    // The directory must be empty again once the scope above has finished.
    assert(tmp.empty());
}
T close(T... args)
T remove(T... args)
// SPDX-FileCopyrightText: 2006-2025, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2025, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <hibf/misc/unreachable.hpp> // for unreachable
// Maps 0 -> -5 and 1 -> 3.
// Every other input reaches seqan::hibf::unreachable(), i.e. calling foo with
// any `i` other than `0` and `1` is undefined behavior!
int foo(int const i)
{
    int mapped{};

    switch (i)
    {
    case 0:
        mapped = -5;
        break;
    case 1:
        mapped = 3;
        break;
    default:
        // Marks this branch as impossible so the compiler need not generate it.
        seqan::hibf::unreachable();
    }

    return mapped;
}
Provides seqan::hibf::unreachable.