SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
search_scheme_algorithm.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <type_traits>
16
24
25namespace seqan3::detail
26{
27
34template <typename configuration_t, typename index_t, typename... policies_t>
35 requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
36class search_scheme_algorithm : protected policies_t...
37{
38private:
40 using traits_t = search_traits<configuration_t>;
42 using search_result_type = typename traits_t::search_result_type;
43
44 static_assert(!std::same_as<search_result_type, empty_type>, "The search result type was not configured.");
45
46public:
50 search_scheme_algorithm() = default;
51 search_scheme_algorithm(search_scheme_algorithm const &) = default;
52 search_scheme_algorithm(search_scheme_algorithm &&) = default;
53 search_scheme_algorithm & operator=(search_scheme_algorithm const &) = default;
54 search_scheme_algorithm & operator=(search_scheme_algorithm &&) = default;
55 ~search_scheme_algorithm() = default;
56
68 search_scheme_algorithm(configuration_t const & cfg, index_t const & index) : policies_t{cfg}...
69 {
70 stratum = cfg.get_or(search_cfg::hit_strata{0}).stratum;
71 index_ptr = std::addressof(index);
72 }
74
95 template <tuple_like indexed_query_t, typename callback_t>
96 requires (std::tuple_size_v<indexed_query_t> == 2)
97 && std::ranges::forward_range<std::tuple_element_t<1, indexed_query_t>>
98 && std::invocable<callback_t, search_result_type>
99 void operator()(indexed_query_t && indexed_query, callback_t && callback)
100 {
101 auto && [query_idx, query] = indexed_query;
102 auto error_state = this->max_error_counts(query); // see policy_max_error
103
104 // construct internal delegate for collecting hits for later filtering (if necessary)
106 auto on_hit_delegate = [&internal_hits](auto const & it)
107 {
108 internal_hits.push_back(it);
109 };
110
111 perform_search_by_hit_strategy(internal_hits, query, error_state, on_hit_delegate);
112
113 // Invoke the callback on the generated result.
114 this->make_results(std::move(internal_hits), query_idx, callback); // see policy_search_result_builder
115 }
116
117private:
119 index_t const * index_ptr{nullptr};
120
122 uint8_t stratum{};
123
124 // forward declaration
125 template <bool abort_on_hit, typename query_t, typename delegate_t>
126 inline void search_algo_bi(query_t & query, search_param const error_left, delegate_t && delegate);
127
135 template <typename query_t, typename delegate_t>
136 void perform_search_by_hit_strategy(std::vector<typename index_t::cursor_type> & internal_hits,
137 query_t & query,
138 search_param error_state,
139 delegate_t const & on_hit_delegate)
140 {
141 if constexpr (!traits_t::search_all_hits)
142 {
143 auto max_total = error_state.total;
144 error_state.total = 0; // start search with less errors
145 while (internal_hits.empty() && error_state.total <= max_total)
146 {
147 // * If you only want the best hit (traits_t::search_single_best_hit), you stop after finding the
148 // first hit, the hit with the least errors (`abort_on_hit` is true).
149 // * If you are in strata mode (traits_t::search_strata_hits), you do the same as with best hits,
150 // but then do the extra step afterwards (`abort_on_hit` is true).
151 // * If you want all best hits (traits_t::search_all_best_hits), you do not stop after the first
152 // hit but continue the current search algorithm/max_error pattern (`abort_on_hit` is true).
153 constexpr bool abort_on_hit = !traits_t::search_all_best_hits;
154 search_algo_bi<abort_on_hit>(query, error_state, on_hit_delegate);
155 ++error_state.total;
156 }
157 if constexpr (traits_t::search_strata_hits)
158 {
159 if (!internal_hits.empty())
160 {
161 internal_hits.clear(); // TODO:don't clear when using Optimum Search Schemes with lower error bounds
162 error_state.total += stratum - 1;
163 search_algo_bi<false>(query, error_state, on_hit_delegate);
164 }
165 }
166 }
167 else // detail::search_mode_all
168 {
169 // If you want to find all hits, you cannot stop once you found any hit (<false>)
170 // since you have to find all paths in the search tree that satisfy the hit condition.
171 search_algo_bi<false>(query, error_state, on_hit_delegate);
172 }
173 }
174};
175
189inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
190{
191 // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
192 // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
193 // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
194 // first hit (e.g. search strategy: best). Even though it is not guaranteed, this seems to be a good greedy
195 // approach.
196 std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
197 return scheme;
198}
199
217template <typename search_scheme_t>
218inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
219{
220 using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
221
222 constexpr bool is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
223
224 // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
225 using result_type = std::conditional_t<
226 is_dyn_scheme,
229 transformation_trait_or_t<std::tuple_size<search_scheme_t>, std::false_type>::value>>;
230
231 result_type result;
232 if constexpr (is_dyn_scheme)
233 result.resize(search_scheme.size());
234
235 uint8_t const blocks{search_scheme[0].blocks()};
236 size_t const block_length{query_length / blocks};
237 uint8_t const rest{static_cast<uint8_t>(query_length % blocks)};
238
239 blocks_length_type blocks_length;
240 // set all blocks_length values to block_length
241 // resp. block_length + 1 for the first `rest = block_length % blocks` values
242 if constexpr (is_dyn_scheme)
243 blocks_length.resize(blocks, block_length);
244 else
245 blocks_length.fill(block_length);
246
247 for (uint8_t block_id = 0; block_id < rest; ++block_id)
248 ++blocks_length[block_id];
249
250 for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
251 {
252 auto const & search = search_scheme[search_id];
253
254 auto & [search_blocks_length, start_pos] = result[search_id];
255
256 // compute cumulative blocks_length and starting position
257 start_pos = 0;
258 if constexpr (is_dyn_scheme)
259 search_blocks_length.resize(blocks);
260 search_blocks_length[0] = blocks_length[search.pi[0] - 1];
261 for (uint8_t i = 1; i < blocks; ++i)
262 {
263 search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
264 if (search.pi[i] < search.pi[0])
265 start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
266 }
267 }
268
269 return result;
270}
271
273// forward declaration
274template <bool abort_on_hit,
275 typename cursor_t,
276 typename query_t,
277 typename search_t,
278 typename blocks_length_t,
279 typename delegate_t>
280inline bool search_ss(cursor_t cur,
281 query_t & query,
282 typename cursor_t::size_type const lb,
283 typename cursor_t::size_type const rb,
284 uint8_t const errors_spent,
285 uint8_t const block_id,
286 bool const go_right,
287 search_t const & search,
288 blocks_length_t const & blocks_length,
289 search_param const error_left,
290 delegate_t && delegate);
292
324template <bool abort_on_hit,
325 typename cursor_t,
326 typename query_t,
327 typename search_t,
328 typename blocks_length_t,
329 typename delegate_t>
330inline bool search_ss_exact(cursor_t cur,
331 query_t & query,
332 typename cursor_t::size_type const lb,
333 typename cursor_t::size_type const rb,
334 uint8_t const errors_spent,
335 uint8_t const block_id,
336 bool const go_right,
337 search_t const & search,
338 blocks_length_t const & blocks_length,
339 search_param const error_left,
340 delegate_t && delegate)
341{
342 using size_type = typename cursor_t::size_type;
343
344 uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
345 bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
346
347 if (go_right)
348 {
349 size_type const infix_lb = rb - 1; // inclusive
350 size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
351
352 if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
353 return false;
354
355 if (search_ss<abort_on_hit>(cur,
356 query,
357 lb,
358 infix_rb + 2,
359 errors_spent,
360 block_id2,
361 go_right2,
362 search,
363 blocks_length,
364 error_left,
365 delegate)
366 && abort_on_hit)
367 {
368 return true;
369 }
370 }
371 else
372 {
373 size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
374 size_type const infix_rb = lb - 1; // inclusive
375
376 if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
377 return false;
378
379 if (search_ss<abort_on_hit>(cur,
380 query,
381 infix_lb,
382 rb,
383 errors_spent,
384 block_id2,
385 go_right2,
386 search,
387 blocks_length,
388 error_left,
389 delegate)
390 && abort_on_hit)
391 {
392 return true;
393 }
394 }
395 return false;
396}
397
404template <bool abort_on_hit,
405 typename cursor_t,
406 typename query_t,
407 typename search_t,
408 typename blocks_length_t,
409 typename delegate_t>
410inline bool search_ss_deletion(cursor_t cur,
411 query_t & query,
412 typename cursor_t::size_type const lb,
413 typename cursor_t::size_type const rb,
414 uint8_t const errors_spent,
415 uint8_t const block_id,
416 bool const go_right,
417 search_t const & search,
418 blocks_length_t const & blocks_length,
419 search_param const error_left,
420 delegate_t && delegate)
421{
422 uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
423 uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
424
425 // Switch to the next block when the min number of errors is reached
426 if (min_error_left_in_block == 0)
427 {
428 uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
429 bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
430
431 if (search_ss<abort_on_hit>(cur,
432 query,
433 lb,
434 rb,
435 errors_spent,
436 block_id2,
437 go_right2,
438 search,
439 blocks_length,
440 error_left,
441 delegate)
442 && abort_on_hit)
443 {
444 return true;
445 }
446 }
447
448 // Insert deletions into the current block as long as possible
449 // Do not allow deletions at the beginning of the leftmost block
450 // Do not allow deletions at the end of the rightmost block
451 if (!(search.pi[block_id] == 1 && !go_right) && !(search.pi[block_id] == search.blocks() && go_right)
452 && max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0
453 && ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
454 {
455 search_param error_left2{error_left};
456 error_left2.total--;
457 error_left2.deletion--;
458 do
459 {
460 if (search_ss_deletion<abort_on_hit>(cur,
461 query,
462 lb,
463 rb,
464 errors_spent + 1,
465 block_id,
466 go_right,
467 search,
468 blocks_length,
469 error_left2,
470 delegate)
471 && abort_on_hit)
472 {
473 return true;
474 }
475 }
476 while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
477 }
478 return false;
479}
480
489template <bool abort_on_hit,
490 typename cursor_t,
491 typename query_t,
492 typename search_t,
493 typename blocks_length_t,
494 typename delegate_t>
495inline bool search_ss_children(cursor_t cur,
496 query_t & query,
497 typename cursor_t::size_type const lb,
498 typename cursor_t::size_type const rb,
499 uint8_t const errors_spent,
500 uint8_t const block_id,
501 bool const go_right,
502 uint8_t const min_error_left_in_block,
503 search_t const & search,
504 blocks_length_t const & blocks_length,
505 search_param const error_left,
506 delegate_t && delegate)
507{
508 using size_type = typename cursor_t::size_type;
509 if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
510 {
511 size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
512
513 size_type lb2 = lb - !go_right;
514 size_type rb2 = rb + go_right;
515
516 do
517 {
518 bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
519
520 // skip if there are more min errors left in the current block than characters in the block
521 // i.e. chars_left - 1 < min_error_left_in_block - delta
522 // TODO: move that outside the if / do-while struct
523 // TODO: incorporate error_left.deletion into formula
524 if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
525 continue;
526
527 if (!delta || error_left.substitution > 0)
528 {
529 search_param error_left2{error_left};
530 error_left2.total -= delta;
531 error_left2.substitution -= delta;
532
533 // At the end of the current block
534 if (rb - lb == blocks_length[block_id])
535 {
536 // Leave the possibility for one or multiple deletions at the end of a block.
537 // Thus do not change the direction (go_right) yet.
538 if (error_left.deletion > 0)
539 {
540 if (search_ss_deletion<abort_on_hit>(cur,
541 query,
542 lb2,
543 rb2,
544 errors_spent + delta,
545 block_id,
546 go_right,
547 search,
548 blocks_length,
549 error_left2,
550 delegate)
551 && abort_on_hit)
552 {
553 return true;
554 }
555 }
556 else
557 {
558 uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
559 bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
560
561 if (search_ss<abort_on_hit>(cur,
562 query,
563 lb2,
564 rb2,
565 errors_spent + delta,
566 block_id2,
567 go_right2,
568 search,
569 blocks_length,
570 error_left2,
571 delegate)
572 && abort_on_hit)
573 {
574 return true;
575 }
576 }
577 }
578 else
579 {
580 if (search_ss<abort_on_hit>(cur,
581 query,
582 lb2,
583 rb2,
584 errors_spent + delta,
585 block_id,
586 go_right,
587 search,
588 blocks_length,
589 error_left2,
590 delegate)
591 && abort_on_hit)
592 {
593 return true;
594 }
595 }
596 }
597
598 // Deletion
599 // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
600 // No deletion at the beginning of the leftmost block.
601 // No deletion at the end of the rightmost block.
602 if (error_left.deletion > 0 && !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1))
603 && !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
604 {
605 search_param error_left3{error_left};
606 error_left3.total--;
607 error_left3.deletion--;
608 search_ss<abort_on_hit>(cur,
609 query,
610 lb,
611 rb,
612 errors_spent + 1,
613 block_id,
614 go_right,
615 search,
616 blocks_length,
617 error_left3,
618 delegate);
619 }
620 }
621 while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
622 }
623 return false;
624}
625
631template <bool abort_on_hit,
632 typename cursor_t,
633 typename query_t,
634 typename search_t,
635 typename blocks_length_t,
636 typename delegate_t>
637inline bool search_ss(cursor_t cur,
638 query_t & query,
639 typename cursor_t::size_type const lb,
640 typename cursor_t::size_type const rb,
641 uint8_t const errors_spent,
642 uint8_t const block_id,
643 bool const go_right,
644 search_t const & search,
645 blocks_length_t const & blocks_length,
646 search_param const error_left,
647 delegate_t && delegate)
648{
649 uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
650 uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
651
652 // Done.
653 if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
654 {
655 delegate(cur);
656 return true;
657 }
658 // Exact search in current block.
659 else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id]))
660 || (error_left.total == 0 && min_error_left_in_block == 0))
661 {
662 if (search_ss_exact<abort_on_hit>(cur,
663 query,
664 lb,
665 rb,
666 errors_spent,
667 block_id,
668 go_right,
669 search,
670 blocks_length,
671 error_left,
672 delegate)
673 && abort_on_hit)
674 {
675 return true;
676 }
677 }
678 // Approximate search in current block.
679 // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
680 else if (error_left.total > 0)
681 {
682 // Insertion
683 if (error_left.insertion > 0)
684 {
685 using size_type = typename cursor_t::size_type;
686
687 size_type const lb2 = lb - !go_right;
688 size_type const rb2 = rb + go_right;
689
690 search_param error_left2{error_left};
691 error_left2.total--;
692 error_left2.insertion--;
693 // At the end of the current block
694 if (rb - lb == blocks_length[block_id])
695 {
696 // Leave the possibility for one or multiple deletions at the end of a block.
697 // Thus do not change the direction (go_right) yet.
698 // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
699 // it pay off the additional complexity and documentation for the user? (Note that the user might only
700 // allow for insertions and deletion and not for mismatches).
701 if (search_ss_deletion<abort_on_hit>(cur,
702 query,
703 lb2,
704 rb2,
705 errors_spent + 1,
706 block_id,
707 go_right,
708 search,
709 blocks_length,
710 error_left2,
711 delegate)
712 && abort_on_hit)
713 {
714 return true;
715 }
716 }
717 else
718 {
719 if (search_ss<abort_on_hit>(cur,
720 query,
721 lb2,
722 rb2,
723 errors_spent + 1,
724 block_id,
725 go_right,
726 search,
727 blocks_length,
728 error_left2,
729 delegate)
730 && abort_on_hit)
731 {
732 return true;
733 }
734 }
735 }
736 if (search_ss_children<abort_on_hit>(cur,
737 query,
738 lb,
739 rb,
740 errors_spent,
741 block_id,
742 go_right,
743 min_error_left_in_block,
744 search,
745 blocks_length,
746 error_left,
747 delegate)
748 && abort_on_hit)
749 {
750 return true;
751 }
752 }
753 return false;
754}
755
779template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
780inline void search_ss(index_t const & index,
781 query_t & query,
782 search_param const error_left,
783 search_scheme_t const & search_scheme,
784 delegate_t && delegate)
785{
786 // retrieve cumulative block lengths and starting position
787 auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
788
789 for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
790 {
791 auto const & search = search_scheme[search_id];
792 auto const & [blocks_length, start_pos] = block_info[search_id];
793
794 bool const hit = search_ss<abort_on_hit>(index.cursor(), // cursor on the index
795 query, // query to be searched
796 start_pos,
797 start_pos + 1, // infix range already searched (open interval)
798 // the first character of `query` has the index 1 (not 0)
799 0, // errors spent
800 0, // current block id in search scheme
801 true, // search the first block from left to right
802 search,
803 blocks_length, // search scheme information
804 error_left, // errors left (broken down by error types)
805 delegate // delegate function called on hit
806 );
807
808 if (abort_on_hit && hit)
809 return;
810 }
811}
812
831template <typename configuration_t, typename index_t, typename... policies_t>
832 requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
833template <bool abort_on_hit, typename query_t, typename delegate_t>
834inline void search_scheme_algorithm<configuration_t, index_t, policies_t...>::search_algo_bi(
835 query_t & query,
836 search_param const error_left,
837 delegate_t && delegate)
838{
839 switch (error_left.total)
840 {
841 case 0:
842 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 0>, delegate);
843 break;
844 case 1:
845 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 1>, delegate);
846 break;
847 case 2:
848 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 2>, delegate);
849 break;
850 case 3:
851 search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 3>, delegate);
852 break;
853 default:
854 auto const & search_scheme{compute_ss(0, error_left.total)};
855 search_ss<abort_on_hit>(*index_ptr, query, error_left, search_scheme, delegate);
856 break;
857 }
858}
859
860} // namespace seqan3::detail
T addressof(T... args)
Provides the bidirectional seqan3::bi_fm_index.
T clear(T... args)
T empty(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:103
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
T max(T... args)
T push_back(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides data structures used by different search algorithms.
Provides the data structures and precomputed instances for (optimum) search schemes.
Provides seqan3::detail::search_traits.
Provides seqan3::views::slice.
Provides seqan3::detail::transformation_trait_or.