SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
search_scheme_algorithm.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <type_traits>
16
24
25namespace seqan3::detail
26{
27
34template <typename configuration_t, typename index_t, typename ...policies_t>
36 requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
38class search_scheme_algorithm : protected policies_t...
39{
40private:
42 using traits_t = search_traits<configuration_t>;
44 using search_result_type = typename traits_t::search_result_type;
45
46 static_assert(!std::same_as<search_result_type, empty_type>, "The search result type was not configured.");
47
48public:
52 search_scheme_algorithm() = default;
53 search_scheme_algorithm(search_scheme_algorithm const &) = default;
54 search_scheme_algorithm(search_scheme_algorithm &&) = default;
55 search_scheme_algorithm & operator=(search_scheme_algorithm const &) = default;
56 search_scheme_algorithm & operator=(search_scheme_algorithm &&) = default;
57 ~search_scheme_algorithm() = default;
58
70 search_scheme_algorithm(configuration_t const & cfg, index_t const & index) : policies_t{cfg}...
71 {
72 stratum = cfg.get_or(search_cfg::hit_strata{0}).stratum;
73 index_ptr = std::addressof(index);
74 }
76
97 template <tuple_like indexed_query_t, typename callback_t>
99 requires (std::tuple_size_v<indexed_query_t> == 2) &&
100 std::ranges::forward_range<std::tuple_element_t<1, indexed_query_t>> &&
101 std::invocable<callback_t, search_result_type>
103 void operator()(indexed_query_t && indexed_query, callback_t && callback)
104 {
105 auto && [query_idx, query] = indexed_query;
106 auto error_state = this->max_error_counts(query); // see policy_max_error
107
108 // construct internal delegate for collecting hits for later filtering (if necessary)
110 auto on_hit_delegate = [&internal_hits] (auto const & it)
111 {
112 internal_hits.push_back(it);
113 };
114
115 perform_search_by_hit_strategy(internal_hits, query, error_state, on_hit_delegate);
116
117 // Invoke the callback on the generated result.
118 this->make_results(std::move(internal_hits), query_idx, callback); // see policy_search_result_builder
119 }
120
121private:
123 index_t const * index_ptr{nullptr};
124
126 uint8_t stratum{};
127
128 // forward declaration
129 template <bool abort_on_hit, typename query_t, typename delegate_t>
130 inline void search_algo_bi(query_t & query, search_param const error_left, delegate_t && delegate);
131
139 template <typename query_t, typename delegate_t>
140 void perform_search_by_hit_strategy(std::vector<typename index_t::cursor_type> & internal_hits,
141 query_t & query,
142 search_param error_state,
143 delegate_t const & on_hit_delegate)
144 {
145 if constexpr (!traits_t::search_all_hits)
146 {
147 auto max_total = error_state.total;
148 error_state.total = 0; // start search with less errors
149 while (internal_hits.empty() && error_state.total <= max_total)
150 {
151 // * If you only want the best hit (traits_t::search_single_best_hit), you stop after finding the
152 // first hit, the hit with the least errors (`abort_on_hit` is true).
153 // * If you are in strata mode (traits_t::search_strata_hits), you do the same as with best hits,
154 // but then do the extra step afterwards (`abort_on_hit` is true).
155 // * If you want all best hits (traits_t::search_all_best_hits), you do not stop after the first
156 // hit but continue the current search algorithm/max_error pattern (`abort_on_hit` is true).
157 constexpr bool abort_on_hit = !traits_t::search_all_best_hits;
158 search_algo_bi<abort_on_hit>(query, error_state, on_hit_delegate);
159 ++error_state.total;
160 }
161 if constexpr (traits_t::search_strata_hits)
162 {
163 if (!internal_hits.empty())
164 {
165 internal_hits.clear(); // TODO:don't clear when using Optimum Search Schemes with lower error bounds
166 error_state.total += stratum - 1;
167 search_algo_bi<false>(query, error_state, on_hit_delegate);
168 }
169 }
170 }
171 else // detail::search_mode_all
172 {
173 // If you want to find all hits, you cannot stop once you found any hit (<false>)
174 // since you have to find all paths in the search tree that satisfy the hit condition.
175 search_algo_bi<false>(query, error_state, on_hit_delegate);
176 }
177 }
178};
179
193inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
194{
195 // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
196 // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
197 // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
198 // first hit (e.g. search strategy: best). Even though it is not guaranteed, this seems to be a good greedy
199 // approach.
200 std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
201 return scheme;
202}
203
221template <typename search_scheme_t>
222inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
223{
224 using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
225
226 bool constexpr is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
227
228 // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
229 using result_type = std::conditional_t<is_dyn_scheme,
232 transformation_trait_or_t<std::tuple_size<search_scheme_t>,
233 std::false_type>::value>>;
234
235 result_type result;
236 if constexpr (is_dyn_scheme)
237 result.resize(search_scheme.size());
238
239 uint8_t const blocks {search_scheme[0].blocks()};
240 size_t const block_length{query_length / blocks};
241 uint8_t const rest {static_cast<uint8_t>(query_length % blocks)};
242
243 blocks_length_type blocks_length;
244 // set all blocks_length values to block_length
245 // resp. block_length + 1 for the first `rest = block_length % blocks` values
246 if constexpr (is_dyn_scheme)
247 blocks_length.resize(blocks, block_length);
248 else
249 blocks_length.fill(block_length);
250
251 for (uint8_t block_id = 0; block_id < rest; ++block_id)
252 ++blocks_length[block_id];
253
254 for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
255 {
256 auto const & search = search_scheme[search_id];
257
258 auto & [search_blocks_length, start_pos] = result[search_id];
259
260 // compute cumulative blocks_length and starting position
261 start_pos = 0;
262 if constexpr (is_dyn_scheme)
263 search_blocks_length.resize(blocks);
264 search_blocks_length[0] = blocks_length[search.pi[0] - 1];
265 for (uint8_t i = 1; i < blocks; ++i)
266 {
267 search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
268 if (search.pi[i] < search.pi[0])
269 start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
270 }
271 }
272
273 return result;
274}
275
277// forward declaration
278template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
279 typename delegate_t>
280inline bool search_ss(cursor_t cur, query_t & query,
281 typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
282 uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
283 blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate);
285
317template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
318 typename delegate_t>
319inline bool search_ss_exact(cursor_t cur, query_t & query,
320 typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
321 uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
322 search_t const & search, blocks_length_t const & blocks_length,
323 search_param const error_left, delegate_t && delegate)
324{
325 using size_type = typename cursor_t::size_type;
326
327 uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
328 bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
329
330 if (go_right)
331 {
332 size_type const infix_lb = rb - 1; // inclusive
333 size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
334
335 if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
336 return false;
337
338 if (search_ss<abort_on_hit>(cur, query, lb, infix_rb + 2, errors_spent, block_id2, go_right2, search,
339 blocks_length, error_left, delegate) && abort_on_hit)
340 {
341 return true;
342 }
343 }
344 else
345 {
346 size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
347 size_type const infix_rb = lb - 1; // inclusive
348
349 if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
350 return false;
351
352 if (search_ss<abort_on_hit>(cur, query, infix_lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
353 error_left, delegate) && abort_on_hit)
354 {
355 return true;
356 }
357 }
358 return false;
359}
360
367template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
368 typename delegate_t>
369inline bool search_ss_deletion(cursor_t cur, query_t & query,
370 typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
371 uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
372 search_t const & search, blocks_length_t const & blocks_length,
373 search_param const error_left, delegate_t && delegate)
374{
375 uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
376 uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
377
378 // Switch to the next block when the min number of errors is reached
379 if (min_error_left_in_block == 0)
380 {
381 uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
382 bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
383
384 if (search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
385 error_left, delegate) && abort_on_hit)
386 {
387 return true;
388 }
389 }
390
391 // Insert deletions into the current block as long as possible
392 // Do not allow deletions at the beginning of the leftmost block
393 // Do not allow deletions at the end of the rightmost block
394 if (!(search.pi[block_id] == 1 && !go_right) &&
395 !(search.pi[block_id] == search.blocks() && go_right) &&
396 max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0 &&
397 ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
398 {
399 search_param error_left2{error_left};
400 error_left2.total--;
401 error_left2.deletion--;
402 do
403 {
404 if (search_ss_deletion<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search,
405 blocks_length, error_left2, delegate) && abort_on_hit)
406 {
407 return true;
408 }
409 } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
410 }
411 return false;
412}
413
422template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
423 typename delegate_t>
424inline bool search_ss_children(cursor_t cur, query_t & query,
425 typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
426 uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
427 uint8_t const min_error_left_in_block, search_t const & search,
428 blocks_length_t const & blocks_length, search_param const error_left,
429 delegate_t && delegate)
430{
431 using size_type = typename cursor_t::size_type;
432 if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
433 {
434 size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
435
436 size_type lb2 = lb - !go_right;
437 size_type rb2 = rb + go_right;
438
439 do
440 {
441 bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
442
443 // skip if there are more min errors left in the current block than characters in the block
444 // i.e. chars_left - 1 < min_error_left_in_block - delta
445 // TODO: move that outside the if / do-while struct
446 // TODO: incorporate error_left.deletion into formula
447 if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
448 continue;
449
450 if (!delta || error_left.substitution > 0)
451 {
452 search_param error_left2{error_left};
453 error_left2.total -= delta;
454 error_left2.substitution -= delta;
455
456 // At the end of the current block
457 if (rb - lb == blocks_length[block_id])
458 {
459 // Leave the possibility for one or multiple deletions at the end of a block.
460 // Thus do not change the direction (go_right) yet.
461 if (error_left.deletion > 0)
462 {
463 if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id,
464 go_right, search, blocks_length, error_left2, delegate) &&
465 abort_on_hit)
466 {
467 return true;
468 }
469 }
470 else
471 {
472 uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
473 bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
474
475 if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id2, go_right2,
476 search, blocks_length, error_left2, delegate) &&
477 abort_on_hit)
478 {
479 return true;
480 }
481 }
482 }
483 else
484 {
485 if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id, go_right, search,
486 blocks_length, error_left2, delegate) && abort_on_hit)
487 {
488 return true;
489 }
490 }
491 }
492
493 // Deletion
494 // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
495 // No deletion at the beginning of the leftmost block.
496 // No deletion at the end of the rightmost block.
497 if (error_left.deletion > 0 &&
498 !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1)) &&
499 !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
500 {
501 search_param error_left3{error_left};
502 error_left3.total--;
503 error_left3.deletion--;
504 search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search, blocks_length,
505 error_left3, delegate);
506 }
507 } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
508 }
509 return false;
510}
511
517template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t,
518 typename blocks_length_t, typename delegate_t>
519inline bool search_ss(cursor_t cur, query_t & query,
520 typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
521 uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
522 blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate)
523{
524 uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
525 uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
526
527 // Done.
528 if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
529 {
530 delegate(cur);
531 return true;
532 }
533 // Exact search in current block.
534 else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id])) ||
535 (error_left.total == 0 && min_error_left_in_block == 0))
536 {
537 if (search_ss_exact<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right, search, blocks_length,
538 error_left, delegate) && abort_on_hit)
539 {
540 return true;
541 }
542 }
543 // Approximate search in current block.
544 // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
545 else if (error_left.total > 0)
546 {
547 // Insertion
548 if (error_left.insertion > 0)
549 {
550 using size_type = typename cursor_t::size_type;
551
552 size_type const lb2 = lb - !go_right;
553 size_type const rb2 = rb + go_right;
554
555 search_param error_left2{error_left};
556 error_left2.total--;
557 error_left2.insertion--;
558 // At the end of the current block
559 if (rb - lb == blocks_length[block_id])
560 {
561 // Leave the possibility for one or multiple deletions at the end of a block.
562 // Thus do not change the direction (go_right) yet.
563 // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
564 // it pay off the additional complexity and documentation for the user? (Note that the user might only
565 // allow for insertions and deletion and not for mismatches).
566 if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
567 blocks_length, error_left2, delegate) && abort_on_hit)
568 {
569 return true;
570 }
571 }
572 else
573 {
574 if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
575 blocks_length, error_left2, delegate) && abort_on_hit)
576 {
577 return true;
578 }
579 }
580 }
581 if (search_ss_children<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right,
582 min_error_left_in_block, search, blocks_length, error_left, delegate) &&
583 abort_on_hit)
584 {
585 return true;
586 }
587 }
588 return false;
589}
590
614template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
615inline void search_ss(index_t const & index, query_t & query, search_param const error_left,
616 search_scheme_t const & search_scheme, delegate_t && delegate)
617{
618 // retrieve cumulative block lengths and starting position
619 auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
620
621 for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
622 {
623 auto const & search = search_scheme[search_id];
624 auto const & [blocks_length, start_pos] = block_info[search_id];
625
626 bool const hit = search_ss<abort_on_hit>(
627 index.cursor(), // cursor on the index
628 query, // query to be searched
629 start_pos, start_pos + 1, // infix range already searched (open interval)
630 // the first character of `query` has the index 1 (not 0)
631 0, // errors spent
632 0, // current block id in search scheme
633 true, // search the first block from left to right
634 search, blocks_length, // search scheme information
635 error_left, // errors left (broken down by error types)
636 delegate // delegate function called on hit
637 );
638
639 if (abort_on_hit && hit)
640 return;
641 }
642}
643
662template <typename configuration_t, typename index_t, typename ...policies_t>
663template <bool abort_on_hit, typename query_t, typename delegate_t>
664inline void search_scheme_algorithm<configuration_t, index_t, policies_t...>::search_algo_bi(
665 query_t & query,
666 search_param const error_left,
667 delegate_t && delegate)
668{
669 switch (error_left.total)
670 {
671 case 0:
672 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 0>, delegate);
673 break;
674 case 1:
675 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 1>, delegate);
676 break;
677 case 2:
678 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 2>, delegate);
679 break;
680 case 3:
681 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 3>, delegate);
682 break;
683 default:
684 auto const & search_scheme{compute_ss(0, error_left.total)};
685 search_ss<abort_on_hit>(*index_ptr, query, error_left, search_scheme, delegate);
686 break;
687 }
688}
689
690} // namespace seqan3::detail
T addressof(T... args)
Provides the bidirectional seqan3::bi_fm_index.
T clear(T... args)
T empty(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:104
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
T max(T... args)
T push_back(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides data structures used by different search algorithms.
Provides the data structures and precomputed instances for (optimum) search schemes.
Provides seqan3::detail::search_traits.
Provides seqan3::views::slice.
Provides seqan3::detail::transformation_trait_or.