22#include <hibf/layout/layout.hpp>
23#include <hibf/sketch/hyperloglog.hpp>
29struct hash<
std::vector<size_t>>
33 return std::accumulate(vec.begin(),
36 [](std::size_t hash,
size_t value)
38 return hash ^= value + 0x9e3779b9ULL + (hash << 6) + (hash >> 2);
64 std::vector<seqan::hibf::sketch::hyperloglog>
const & sketches_,
65 std::vector<size_t>
const & kmer_counts);
91 static void print_header_to(std::ostream & stream,
bool const verbose =
true);
94 void print_summary_to(
size_t & t_max_64_memory, std::ostream & stream,
bool const verbose =
true);
125 std::vector<seqan::hibf::sketch::hyperloglog>
const &
sketches;
134 struct level_summary;
178 bin(
bin_kind const kind_,
size_t const spanning_tbs, std::vector<size_t>
const & user_bin_indices_) :
Definition: hibf_statistics.hpp:160
std::vector< size_t > user_bin_indices
The user bin indices of this bin.
Definition: hibf_statistics.hpp:166
size_t child_level_idx
[MERGED] The lower level ibf statistics.
Definition: hibf_statistics.hpp:169
bin(bin &&b)=default
Defaulted.
bin & operator=(bin const &)=default
Defaulted.
size_t cardinality
The size/weight of the bin (either a kmer count or hll sketch estimation).
Definition: hibf_statistics.hpp:163
level child_level
[MERGED] The lower level ibf statistics.
Definition: hibf_statistics.hpp:168
bin(bin_kind const kind_, size_t const spanning_tbs, std::vector< size_t > const &user_bin_indices_)
Definition: hibf_statistics.hpp:178
size_t num_contained_ubs
[MERGED] How many UBs are merged within this TB.
Definition: hibf_statistics.hpp:164
size_t num_spanning_tbs
[SPLIT] How many TBs are used for this single UB.
Definition: hibf_statistics.hpp:165
bin & operator=(bin &&)=default
Defaulted.
bin_kind kind
Either a split or merged bin.
Definition: hibf_statistics.hpp:162
bin(bin const &b)=default
Defaulted.
size_t tb_index
Definition: hibf_statistics.hpp:167
Definition: hibf_statistics.hpp:49
static std::string byte_size_to_formatted_str(size_t const bytes)
Round bytes to the appropriate unit and convert to string with unit.
Definition: hibf_statistics.cpp:276
hibf_statistics()=delete
Deleted. Holds reference members.
size_t total_hibf_size_in_byte()
Return the total corrected size of the HIBF in bytes.
Definition: hibf_statistics.cpp:252
double expected_HIBF_query_cost
The estimated query cost relative to the total k-mer count in the data set.
Definition: hibf_statistics.hpp:109
void compute_cardinalities(level &curr_level)
Definition: hibf_statistics.cpp:437
void compute_total_query_cost(level &curr_level)
Computes the estimated query cost.
Definition: hibf_statistics.cpp:471
double total_query_cost
The estimated query cost of every single kmer in this HIBF.
Definition: hibf_statistics.hpp:106
std::map< size_t, level_summary > summaries
The gathered summary of statistics for each level of this HIBF.
Definition: hibf_statistics.hpp:137
bin_kind
The kind of bin that is stored.
Definition: hibf_statistics.hpp:82
@ merged
Multiple user bins are merged into a single technical bin.
@ split
A single user bin, split into 1 or more bins (even though 1 is not technically split).
std::vector< size_t > const & counts
A reference to the input counts.
Definition: hibf_statistics.hpp:128
double const merged_fpr_correction_factor
The merged bin false positive correction factors to use for the statistics.
Definition: hibf_statistics.hpp:122
hibf_statistics(hibf_statistics const &b)=delete
Deleted. Holds const member.
std::vector< double > const fp_correction
The split bin false positive correction factors to use for the statistics.
Definition: hibf_statistics.hpp:119
hibf_statistics(hibf_statistics &&b)=delete
Deleted. Holds const member.
void gather_statistics(level const &curr_level, size_t const level_summary_index)
Recursively gather all the statistics from the bins.
Definition: hibf_statistics.cpp:550
level top_level_ibf
The top level IBF of this HIBF, often starting point for recursions.
Definition: hibf_statistics.hpp:103
seqan::hibf::layout::layout hibf_layout
The HIBF layout that the statistics are gathered for.
Definition: hibf_statistics.hpp:112
static void print_header_to(std::ostream &stream, bool const verbose=true)
Prints the column names of the summary to the command line.
Definition: hibf_statistics.cpp:77
void finalize()
Gather all statistics to have all members ready.
Definition: hibf_statistics.cpp:63
std::string to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const
Compute the Bloom Filter size from number_of_kmers_to_be_stored and return it as a formatted string w...
Definition: hibf_statistics.cpp:354
std::vector< seqan::hibf::sketch::hyperloglog > const & sketches
A reference to the input sketches.
Definition: hibf_statistics.hpp:125
void collect_bins()
Definition: hibf_statistics.cpp:363
void print_summary_to(size_t &t_max_64_memory, std::ostream &stream, bool const verbose=true)
Prints a tab-separated summary of the statistics of this HIBF to the command line.
Definition: hibf_statistics.cpp:108
hibf_statistics & operator=(hibf_statistics const &)=delete
Deleted. Holds const member.
~hibf_statistics()=default
Defaulted.
hibf_statistics & operator=(hibf_statistics &&)=delete
Deleted. Holds const member.
size_t const total_kmer_count
The original kmer count of all user bins.
Definition: hibf_statistics.hpp:131
Definition: determine_best_number_of_technical_bins.hpp:17
Definition: hibf_statistics.hpp:26
Definition: configuration.hpp:24
Definition: hibf_statistics.hpp:190
std::vector< size_t > num_tbs
Definition: hibf_statistics.hpp:193
std::vector< size_t > max_split_tb_span
Definition: hibf_statistics.hpp:202
std::vector< size_t > ibf_mem_size_no_corr
Definition: hibf_statistics.hpp:209
std::vector< size_t > num_merged_tbs
Definition: hibf_statistics.hpp:197
std::vector< size_t > max_ubs_in_merged
Definition: hibf_statistics.hpp:206
std::vector< size_t > num_ubs
Definition: hibf_statistics.hpp:194
std::vector< size_t > split_tb_corr_kmers
Definition: hibf_statistics.hpp:203
std::vector< size_t > ibf_mem_size
Definition: hibf_statistics.hpp:208
std::vector< size_t > split_tb_kmers
Definition: hibf_statistics.hpp:204
std::vector< size_t > num_split_ubs
Definition: hibf_statistics.hpp:199
std::vector< size_t > num_merged_ubs
Definition: hibf_statistics.hpp:200
std::vector< size_t > num_split_tbs
Definition: hibf_statistics.hpp:196
size_t num_ibfs
Definition: hibf_statistics.hpp:191
A representation of an IBF level that gathers information about bins in an IBF.
Definition: hibf_statistics.hpp:72
std::vector< bin > bins
The bins of the current IBF level. May be split or merged bins.
Definition: hibf_statistics.hpp:74
double current_query_cost
The query cost to arrive at this IBF (updated before backtracking respective DP).
Definition: hibf_statistics.hpp:77
Definition: shared.hpp:19
size_t operator()(std::vector< size_t > const &vec) const
Definition: hibf_statistics.hpp:31