Chopper
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
compute_ibf_size.hpp
Go to the documentation of this file.
1// ---------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
6// ---------------------------------------------------------------------------------------------------
7
8#pragma once
9
10#include <cstdint>
11
12#include <hibf/build/bin_size_in_bits.hpp>
13#include <hibf/build/build_data.hpp>
14#include <hibf/contrib/robin_hood.hpp>
15#include <hibf/layout/graph.hpp>
16#include <hibf/misc/divide_and_ceil.hpp>
17
18void update_parent_kmers(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
19 robin_hood::unordered_flat_set<uint64_t> const & kmers)
20{
21 parent_kmers.insert(kmers.begin(), kmers.end());
22}
23
24// this function is copied from seqan::hibf::build::construct_ibf
25// it needs to be held consistent in order to compute the correct sizes
26size_t compute_ibf_size(robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
27 robin_hood::unordered_flat_set<uint64_t> & kmers,
28 size_t const number_of_bins,
29 seqan::hibf::layout::graph::node const & ibf_node,
30 seqan::hibf::build::build_data & data,
31 size_t const current_hibf_level)
32{
33 bool const max_bin_is_merged = ibf_node.max_bin_is_merged();
34 assert(!max_bin_is_merged || number_of_bins == 1u); // merged max bin implies (=>) number of bins == 1
35
36 size_t const kmers_per_bin = seqan::hibf::divide_and_ceil(kmers.size(), number_of_bins);
37 double const fpr = max_bin_is_merged ? data.config.relaxed_fpr : data.config.maximum_fpr;
38
39 size_t const bin_bits{seqan::hibf::build::bin_size_in_bits({.fpr = fpr, //
40 .hash_count = data.config.number_of_hash_functions,
41 .elements = kmers_per_bin})};
42 // data.fpr_correction[1] == 1.0, but we can avoid floating point operations with the ternary.
43 // Check number_of_bins instead of max_bin_is_merged, because split bins can also occupy only one technical bin.
44 size_t const bin_size{number_of_bins == 1u
45 ? bin_bits
46 : static_cast<size_t>(std::ceil(bin_bits * data.fpr_correction[number_of_bins]))};
47
48 size_t const ibf_size = ibf_node.number_of_technical_bins * bin_size;
49
50 if (current_hibf_level > 0 /* not top level */)
51 update_parent_kmers(parent_kmers, kmers);
52
53 return ibf_size;
54}
size_t compute_ibf_size(robin_hood::unordered_flat_set< uint64_t > &parent_kmers, robin_hood::unordered_flat_set< uint64_t > &kmers, size_t const number_of_bins, seqan::hibf::layout::graph::node const &ibf_node, seqan::hibf::build::build_data &data, size_t const current_hibf_level)
Definition: compute_ibf_size.hpp:26
void update_parent_kmers(robin_hood::unordered_flat_set< uint64_t > &parent_kmers, robin_hood::unordered_flat_set< uint64_t > const &kmers)
Definition: compute_ibf_size.hpp:18