Chopper
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hibf_statistics.hpp
Go to the documentation of this file.
1// ---------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md
6// ---------------------------------------------------------------------------------------------------
7
8#pragma once
9
10#include <algorithm>
11#include <cassert>
12#include <cstddef>
13#include <iosfwd>
14#include <map>
15#include <numeric>
16#include <string>
17#include <typeindex>
18#include <vector>
19
21
22#include <hibf/layout/layout.hpp>
23#include <hibf/sketch/hyperloglog.hpp>
24
namespace std
{

// Hash functor for `std::vector<size_t>`, so that such vectors can be used as keys of unordered containers.
//
// The accumulator starts at `vec.size()` and every element is folded in with the same mixing step as
// boost::hash_combine: `seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2)`.
// An empty vector therefore hashes to 0, and the result depends on the element order.
//
// NOTE(review): the C++ standard only sanctions specialising `std::hash` for program-defined types;
// `std::vector<size_t>` is not one. The specialisation is kept as is because code elsewhere may rely on
// `std::hash<std::vector<size_t>>` being available.
template <>
struct hash<std::vector<size_t>>
{
    // Returns the combined hash of all elements of `vec`.
    size_t operator()(std::vector<size_t> const & vec) const
    {
        return std::accumulate(vec.begin(),
                               vec.end(),
                               vec.size(),
                               [](std::size_t const seed, size_t const value) -> std::size_t
                               {
                                   // `seed` (instead of `hash`) avoids shadowing the enclosing template name.
                                   return seed ^ (value + 0x9e3779b9ULL + (seed << 6) + (seed >> 2));
                               });
    }
};

} // namespace std
44
45namespace chopper::layout
46{
47
// Body of the `hibf_statistics` class, which gathers and prints statistics of an HIBF.
// NOTE(review): the `class` header line and some member declarations are absent from this listing excerpt;
// the leading numbers are the line numbers of the original header file.
49{
50public:
// Deleted. Holds reference members.
51 hibf_statistics() = delete;
// Deleted. Holds const member.
52 hibf_statistics(hibf_statistics const & b) = delete;
// Defaulted.
56 ~hibf_statistics() = default;
57
// Constructs the statistics object from a configuration, the input sketches and the input k-mer counts.
// NOTE(review): `sketches_` and `kmer_counts` are presumably bound to the reference members `sketches` and
// `counts` below and would then have to outlive this object -- confirm in hibf_statistics.cpp.
63 hibf_statistics(configuration const & config_,
64 std::vector<seqan::hibf::sketch::hyperloglog> const & sketches_,
65 std::vector<size_t> const & kmer_counts);
66
// Forward declaration of the nested bin type; needed because `level::bins` stores `bin` objects.
68 class bin;
69
// A representation of an IBF level that gathers information about bins in an IBF.
71 struct level
72 {
// The bins of the current IBF level. May be split or merged bins.
74 std::vector<bin> bins;
75
// The query cost to arrive at this IBF (updated before backtracking respective DP).
77 double current_query_cost{0.0};
78 };
79
// The kind of bin that is stored.
81 enum class bin_kind
82 {
// A single user bin, split into 1 or more bins (even though 1 is not technically split).
83 split,
// Multiple user bins are merged into a single technical bin.
84 merged
85 };
86
// Gather all statistics to have all members ready.
88 void finalize();
89
// Prints the column names of the summary to `stream`.
// `verbose` defaults to true; its exact effect is defined in hibf_statistics.cpp.
91 static void print_header_to(std::ostream & stream, bool const verbose = true);
92
// Prints a tab-separated summary of the statistics of this HIBF to `stream`.
// NOTE(review): `t_max_64_memory` is taken by non-const reference, so it is presumably read and/or updated
// by this call -- confirm in hibf_statistics.cpp.
94 void print_summary_to(size_t & t_max_64_memory, std::ostream & stream, bool const verbose = true);
95
98
// Round bytes to the appropriate unit and convert to string with unit.
100 [[nodiscard]] static std::string byte_size_to_formatted_str(size_t const bytes);
101
104
// The estimated query cost of every single kmer in this HIBF.
106 double total_query_cost{0.0};
107
110
// The HIBF layout, stored by value.
// NOTE(review): the generated documentation describes this member as "A reference to the input counts.",
// which does not match its type and looks like a copy-paste slip -- confirm.
112 seqan::hibf::layout::layout hibf_layout;
113
114private:
117
// The split bin false positive correction factors to use for the statistics.
119 std::vector<double> const fp_correction{};
120
123
// A reference to the input sketches.
125 std::vector<seqan::hibf::sketch::hyperloglog> const & sketches;
126
// A reference to the input counts.
128 std::vector<size_t> const & counts;
129
// The original kmer count of all user bins.
131 size_t const total_kmer_count{};
132
// Forward declaration of the per-level summary type stored in `summaries`.
134 struct level_summary;
135
// The gathered summary of statistics for each level of this HIBF.
137 std::map<size_t, level_summary> summaries;
138
// Compute the Bloom Filter size from number_of_kmers_to_be_stored and return it as a formatted string.
143 std::string to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const;
144
// NOTE(review): no description is available in this excerpt; see hibf_statistics.cpp for the behaviour.
145 void collect_bins();
146
// NOTE(review): no description is available in this excerpt; presumably fills in the cardinalities of the
// bins of `curr_level` -- confirm in hibf_statistics.cpp.
147 void compute_cardinalities(level & curr_level);
148
// Computes the estimated query cost.
150 void compute_total_query_cost(level & curr_level);
151
// Recursively gather all the statistics from the bins.
156 void gather_statistics(level const & curr_level, size_t const level_summary_index);
157};
158
// Body of the nested `bin` class, which describes one bin of an IBF level.
// NOTE(review): the class header line and the declarations of `kind`, `num_contained_ubs`,
// `num_spanning_tbs` (all used by the constructor below) are absent from this listing excerpt.
160{
161public:
// The size/weight of the bin (either a kmer count or hll sketch estimation).
163 size_t cardinality;
// The user bin indices of this bin.
166 std::vector<size_t> user_bin_indices;
167 size_t tb_index; // The (first) technical bin idx this bin is stored in.
170
// All special member functions are defaulted.
171 bin() = default;
172 bin(bin const & b) = default;
173 bin & operator=(bin const &) = default;
174 bin(bin && b) = default;
175 bin & operator=(bin &&) = default;
176 ~bin() = default;
177
// Constructs a bin of kind `kind_` that spans `spanning_tbs` technical bins and contains the user bins
// listed in `user_bin_indices_` (copied). The number of contained user bins is taken from the size of
// `user_bin_indices_`. `cardinality` and `tb_index` are not set by this constructor.
178 bin(bin_kind const kind_, size_t const spanning_tbs, std::vector<size_t> const & user_bin_indices_) :
179 kind{kind_},
180 num_contained_ubs{user_bin_indices_.size()},
181 num_spanning_tbs{spanning_tbs},
182 user_bin_indices{user_bin_indices_}
183 {
// Invariant: a split bin holds exactly one user bin, a merged bin occupies exactly one technical bin.
184 assert((kind == bin_kind::split && num_contained_ubs == 1u)
185 || (kind == bin_kind::merged && num_spanning_tbs == 1u));
186 }
187};
188
// Body of the `level_summary` struct, the mapped type of `hibf_statistics::summaries`.
// Every member is value-initialised (0 / empty vector).
// NOTE(review): the struct header line is absent from this listing excerpt and no member descriptions are
// available; the vectors presumably hold one entry per IBF of the level -- confirm in hibf_statistics.cpp.
190{
// NOTE(review): presumably the number of IBFs on this level -- confirm.
191 size_t num_ibfs{};
192
// NOTE(review): "tbs"/"ubs" presumably abbreviate technical bins / user bins -- confirm.
193 std::vector<size_t> num_tbs{};
194 std::vector<size_t> num_ubs{};
195
196 std::vector<size_t> num_split_tbs{};
197 std::vector<size_t> num_merged_tbs{};
198
199 std::vector<size_t> num_split_ubs{};
200 std::vector<size_t> num_merged_ubs{};
201
202 std::vector<size_t> max_split_tb_span{};
203 std::vector<size_t> split_tb_corr_kmers{};
204 std::vector<size_t> split_tb_kmers{};
205
206 std::vector<size_t> max_ubs_in_merged{};
207
// NOTE(review): presumably IBF memory sizes with and without the false positive correction -- confirm.
208 std::vector<size_t> ibf_mem_size{};
209 std::vector<size_t> ibf_mem_size_no_corr{};
210};
211
212} // namespace chopper::layout
Definition: hibf_statistics.hpp:160
std::vector< size_t > user_bin_indices
The user bin indices of this bin.
Definition: hibf_statistics.hpp:166
size_t child_level_idx
[MERGED] The lower level ibf statistics.
Definition: hibf_statistics.hpp:169
bin(bin &&b)=default
Defaulted.
bin & operator=(bin const &)=default
Defaulted.
size_t cardinality
The size/weight of the bin (either a kmer count or hll sketch estimation).
Definition: hibf_statistics.hpp:163
level child_level
[MERGED] The lower level ibf statistics.
Definition: hibf_statistics.hpp:168
bin(bin_kind const kind_, size_t const spanning_tbs, std::vector< size_t > const &user_bin_indices_)
Definition: hibf_statistics.hpp:178
size_t num_contained_ubs
[MERGED] How many UBs are merged within this TB.
Definition: hibf_statistics.hpp:164
size_t num_spanning_tbs
[SPLIT] How many TBs are used for this single UB.
Definition: hibf_statistics.hpp:165
bin & operator=(bin &&)=default
Defaulted.
bin_kind kind
Either a split or merged bin.
Definition: hibf_statistics.hpp:162
bin(bin const &b)=default
Defaulted.
size_t tb_index
Definition: hibf_statistics.hpp:167
Definition: hibf_statistics.hpp:49
static std::string byte_size_to_formatted_str(size_t const bytes)
Round bytes to the appropriate unit and convert to string with unit.
Definition: hibf_statistics.cpp:276
hibf_statistics()=delete
Deleted. Holds reference members.
size_t total_hibf_size_in_byte()
Return the total corrected size of the HIBF in bytes.
Definition: hibf_statistics.cpp:252
double expected_HIBF_query_cost
The estimated query cost relative to the total k-mer count in the data set.
Definition: hibf_statistics.hpp:109
void compute_cardinalities(level &curr_level)
Definition: hibf_statistics.cpp:437
void compute_total_query_cost(level &curr_level)
Computes the estimated query cost.
Definition: hibf_statistics.cpp:471
double total_query_cost
The estimated query cost of every single kmer in this HIBF.
Definition: hibf_statistics.hpp:106
std::map< size_t, level_summary > summaries
The gathered summary of statistics for each level of this HIBF.
Definition: hibf_statistics.hpp:137
bin_kind
The kind of bin that is stored.
Definition: hibf_statistics.hpp:82
@ merged
Multiple user bins are merged into a single technical bin.
@ split
A single user bin, split into 1 or more bins (even though 1 is not technically split).
std::vector< size_t > const & counts
A reference to the input counts.
Definition: hibf_statistics.hpp:128
double const merged_fpr_correction_factor
The merged bin false positive correction factors to use for the statistics.
Definition: hibf_statistics.hpp:122
hibf_statistics(hibf_statistics const &b)=delete
Deleted. Holds const member.
std::vector< double > const fp_correction
The split bin false positive correction factors to use for the statistics.
Definition: hibf_statistics.hpp:119
hibf_statistics(hibf_statistics &&b)=delete
Deleted. Holds const member.
void gather_statistics(level const &curr_level, size_t const level_summary_index)
Recursively gather all the statistics from the bins.
Definition: hibf_statistics.cpp:550
level top_level_ibf
The top level IBF of this HIBF, often starting point for recursions.
Definition: hibf_statistics.hpp:103
seqan::hibf::layout::layout hibf_layout
The HIBF layout (stored by value) that these statistics are gathered for.
Definition: hibf_statistics.hpp:112
static void print_header_to(std::ostream &stream, bool const verbose=true)
Prints the column names of the summary to the given stream.
Definition: hibf_statistics.cpp:77
void finalize()
Gather all statistics to have all members ready.
Definition: hibf_statistics.cpp:63
std::string to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const
Compute the Bloom Filter size from number_of_kmers_to_be_stored and return it as a formatted string w...
Definition: hibf_statistics.cpp:354
std::vector< seqan::hibf::sketch::hyperloglog > const & sketches
A reference to the input sketches.
Definition: hibf_statistics.hpp:125
void collect_bins()
Definition: hibf_statistics.cpp:363
void print_summary_to(size_t &t_max_64_memory, std::ostream &stream, bool const verbose=true)
Prints a tab-separated summary of the statistics of this HIBF to the command line.
Definition: hibf_statistics.cpp:108
hibf_statistics & operator=(hibf_statistics const &)=delete
Deleted. Holds const member.
~hibf_statistics()=default
Defaulted.
hibf_statistics & operator=(hibf_statistics &&)=delete
Deleted. Holds const member.
size_t const total_kmer_count
The original kmer count of all user bins.
Definition: hibf_statistics.hpp:131
Definition: determine_best_number_of_technical_bins.hpp:17
Definition: hibf_statistics.hpp:26
Definition: configuration.hpp:24
Definition: hibf_statistics.hpp:190
std::vector< size_t > num_tbs
Definition: hibf_statistics.hpp:193
std::vector< size_t > max_split_tb_span
Definition: hibf_statistics.hpp:202
std::vector< size_t > ibf_mem_size_no_corr
Definition: hibf_statistics.hpp:209
std::vector< size_t > num_merged_tbs
Definition: hibf_statistics.hpp:197
std::vector< size_t > max_ubs_in_merged
Definition: hibf_statistics.hpp:206
std::vector< size_t > num_ubs
Definition: hibf_statistics.hpp:194
std::vector< size_t > split_tb_corr_kmers
Definition: hibf_statistics.hpp:203
std::vector< size_t > ibf_mem_size
Definition: hibf_statistics.hpp:208
std::vector< size_t > split_tb_kmers
Definition: hibf_statistics.hpp:204
std::vector< size_t > num_split_ubs
Definition: hibf_statistics.hpp:199
std::vector< size_t > num_merged_ubs
Definition: hibf_statistics.hpp:200
std::vector< size_t > num_split_tbs
Definition: hibf_statistics.hpp:196
size_t num_ibfs
Definition: hibf_statistics.hpp:191
A representation of an IBF level that gathers information about bins in an IBF.
Definition: hibf_statistics.hpp:72
std::vector< bin > bins
The bins of the current IBF level. May be split or merged bins.
Definition: hibf_statistics.hpp:74
double current_query_cost
The query cost to arrive at this IBF (updated before backtracking respective DP).
Definition: hibf_statistics.hpp:77
Definition: shared.hpp:19
size_t operator()(std::vector< size_t > const &vec) const
Definition: hibf_statistics.hpp:31