Raptor
A fast and space-efficient pre-filter
All Classes Namespaces Files Functions Variables Macros Pages Concepts
search_singular_ibf.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <future>
13#include <random>
14
15#include <seqan3/search/views/minimiser_hash.hpp>
16
17#include <hibf/contrib/std/chunk_view.hpp>
18
25
26namespace raptor
27{
28
29template <typename index_t>
30void search_singular_ibf(search_arguments const & arguments, index_t && index)
31{
33
34 auto cereal_future = std::async(std::launch::async,
35 [&]()
36 {
37 load_index(index, arguments);
38 });
39
40 seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>> fin{
41 arguments.query_file};
42 using record_type = typename decltype(fin)::record_type;
44
45 sync_out synced_out{arguments};
46
47 raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()};
48
49 auto worker = [&](size_t const start, size_t const extent)
50 {
51 seqan::hibf::serial_timer local_compute_minimiser_timer{};
52 seqan::hibf::serial_timer local_query_ibf_timer{};
53 seqan::hibf::serial_timer local_generate_results_timer{};
54
55#if defined(__clang__)
56 auto counter = [&index]()
57#else
58 auto counter = [&index, is_ibf]()
59#endif
60 {
61 if constexpr (is_ibf)
62 return index.ibf().template counting_agent<uint16_t>();
63 else
64 return index.ibf().membership_agent();
65 }();
66 std::string result_string{};
67 std::vector<uint64_t> minimiser;
68
69 auto hash_adaptor = seqan3::views::minimiser_hash(arguments.shape,
70 seqan3::window_size{arguments.window_size},
71 seqan3::seed{adjust_seed(arguments.shape_weight)});
72
73 for (auto && [id, seq] : std::span{records.data() + start, extent})
74 {
75 result_string.clear();
76 result_string += id;
77 result_string += '\t';
78
79 auto minimiser_view = seq | hash_adaptor | std::views::common;
80 local_compute_minimiser_timer.start();
81 minimiser.assign(minimiser_view.begin(), minimiser_view.end());
82 local_compute_minimiser_timer.stop();
83
84 size_t const minimiser_count{minimiser.size()};
85 size_t const threshold = thresholder.get(minimiser_count);
86
87 if constexpr (is_ibf)
88 {
89 local_query_ibf_timer.start();
90 auto & result = counter.bulk_count(minimiser);
91 local_query_ibf_timer.stop();
92 size_t current_bin{0};
93 local_generate_results_timer.start();
94 for (auto && count : result)
95 {
96 if (count >= threshold)
97 {
98 result_string += std::to_string(current_bin);
99 result_string += ',';
100 }
101 ++current_bin;
102 }
103 }
104 else
105 {
106 local_query_ibf_timer.start();
107 auto & result = counter.membership_for(minimiser, threshold); // Results contains user bin IDs
108 local_query_ibf_timer.stop();
109 local_generate_results_timer.start();
110 for (auto && count : result)
111 {
112 result_string += std::to_string(count);
113 result_string += ',';
114 }
115 }
116
117 if (auto & last_char = result_string.back(); last_char == ',')
118 last_char = '\n';
119 else
120 result_string += '\n';
121
122 synced_out.write(result_string);
123 local_generate_results_timer.stop();
124 }
125
126 arguments.compute_minimiser_timer += local_compute_minimiser_timer;
127 arguments.query_ibf_timer += local_query_ibf_timer;
128 arguments.generate_results_timer += local_generate_results_timer;
129 };
130
131 auto write_header = [&]()
132 {
133 if constexpr (is_ibf)
134 return synced_out.write_header(arguments, index.ibf().hash_function_count());
135 else
136 return synced_out.write_header(arguments, index.ibf().ibf_vector[0].hash_function_count());
137 };
138
139 for (auto && chunked_records : fin | seqan::stl::views::chunk((1ULL << 20) * 10))
140 {
141 records.clear();
142 arguments.query_file_io_timer.start();
143 std::ranges::move(chunked_records, std::back_inserter(records));
144 // Very fast, improves parallel processing when chunks of the query belong to the same bin.
146 arguments.query_file_io_timer.stop();
147
148 cereal_future.get();
149 [[maybe_unused]] static bool header_written = write_header(); // called exactly once
150
151 arguments.parallel_search_timer.start();
152 do_parallel(worker, records.size(), arguments.threads);
153 arguments.parallel_search_timer.stop();
154 }
155}
156
157} // namespace raptor
Provides raptor::adjust_seed.
T async(T... args)
T back_inserter(T... args)
Definition threshold.hpp:19
T data(T... args)
Provides raptor::dna4_traits.
Provides raptor::do_parallel.
Provides raptor::load_index.
T min(T... args)
T move(T... args)
T shuffle(T... args)
Provides raptor::sync_out.
Provides raptor::threshold::threshold.
T to_string(T... args)
Hide me