SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
search_scheme_algorithm.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <type_traits>
16 
23 
24 namespace seqan3::detail
25 {
26 
35 template <typename configuration_t, bi_fm_index_specialisation index_t, typename ...policies_t>
36 class search_scheme_algorithm : protected policies_t...
37 {
38 private:
40  using traits_t = search_traits<configuration_t>;
42  using search_result_type = typename traits_t::search_result_type;
43 
44  static_assert(!std::same_as<search_result_type, empty_type>, "The search result type was not configured.");
45 
46 public:
50  search_scheme_algorithm() = default;
51  search_scheme_algorithm(search_scheme_algorithm const &) = default;
52  search_scheme_algorithm(search_scheme_algorithm &&) = default;
53  search_scheme_algorithm & operator=(search_scheme_algorithm const &) = default;
54  search_scheme_algorithm & operator=(search_scheme_algorithm &&) = default;
55  ~search_scheme_algorithm() = default;
56 
67  search_scheme_algorithm(configuration_t const & cfg, index_t const & index) : policies_t{cfg}...
68  {
69  stratum = cfg.get_or(search_cfg::hit_strata{0}).stratum;
70  index_ptr = std::addressof(index);
71  }
73 
94  template <tuple_like indexed_query_t, typename callback_t>
96  requires (std::tuple_size_v<indexed_query_t> == 2) &&
97  std::ranges::forward_range<std::tuple_element_t<1, indexed_query_t>> &&
98  std::invocable<callback_t, search_result_type>
100  void operator()(indexed_query_t && indexed_query, callback_t && callback)
101  {
102  auto && [query_idx, query] = indexed_query;
103  auto error_state = this->max_error_counts(query); // see policy_max_error
104 
105  // construct internal delegate for collecting hits for later filtering (if necessary)
107  auto on_hit_delegate = [&internal_hits] (auto const & it)
108  {
109  internal_hits.push_back(it);
110  };
111 
112  perform_search_by_hit_strategy(internal_hits, query, error_state, on_hit_delegate);
113 
114  // Invoke the callback on the generated result.
115  this->make_results(std::move(internal_hits), query_idx, callback); // see policy_search_result_builder
116  }
117 
118 private:
120  index_t const * index_ptr{nullptr};
121 
123  uint8_t stratum{};
124 
125  // forward declaration
126  template <bool abort_on_hit, typename query_t, typename delegate_t>
127  inline void search_algo_bi(query_t & query, search_param const error_left, delegate_t && delegate);
128 
136  template <typename query_t, typename delegate_t>
137  void perform_search_by_hit_strategy(std::vector<typename index_t::cursor_type> & internal_hits,
138  query_t & query,
139  search_param error_state,
140  delegate_t const & on_hit_delegate)
141  {
142  if constexpr (!traits_t::search_all_hits)
143  {
144  auto max_total = error_state.total;
145  error_state.total = 0; // start search with less errors
146  while (internal_hits.empty() && error_state.total <= max_total)
147  {
148  // * If you only want the best hit (traits_t::search_single_best_hit), you stop after finding the
149  // first hit, the hit with the least errors (`abort_on_hit` is true).
150  // * If you are in strata mode (traits_t::search_strata_hits), you do the same as with best hits,
151  // but then do the extra step afterwards (`abort_on_hit` is true).
152  // * If you want all best hits (traits_t::search_all_best_hits), you do not stop after the first
153  // hit but continue the current search algorithm/max_error pattern (`abort_on_hit` is true).
154  constexpr bool abort_on_hit = !traits_t::search_all_best_hits;
155  search_algo_bi<abort_on_hit>(query, error_state, on_hit_delegate);
156  ++error_state.total;
157  }
158  if constexpr (traits_t::search_strata_hits)
159  {
160  if (!internal_hits.empty())
161  {
162  internal_hits.clear(); // TODO:don't clear when using Optimum Search Schemes with lower error bounds
163  error_state.total += stratum - 1;
164  search_algo_bi<false>(query, error_state, on_hit_delegate);
165  }
166  }
167  }
168  else // detail::search_mode_all
169  {
170  // If you want to find all hits, you cannot stop once you found any hit (<false>)
171  // since you have to find all paths in the search tree that satisfy the hit condition.
172  search_algo_bi<false>(query, error_state, on_hit_delegate);
173  }
174  }
175 };
176 
189 inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
190 {
191  // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
192  // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
193  // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
194  // first hit (e.g. search strategy: best). Even though it is not guaranteed, this seems to be a good greedy
195  // approach.
196  std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
197  return scheme;
198 }
199 
216 template <typename search_scheme_t>
217 inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
218 {
219  using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
220 
221  bool constexpr is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
222 
223  // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
224  using result_type = std::conditional_t<is_dyn_scheme,
227  transformation_trait_or_t<std::tuple_size<search_scheme_t>,
228  std::false_type>::value>>;
229 
230  result_type result;
231  if constexpr (is_dyn_scheme)
232  result.resize(search_scheme.size());
233 
234  uint8_t const blocks {search_scheme[0].blocks()};
235  size_t const block_length{query_length / blocks};
236  uint8_t const rest {static_cast<uint8_t>(query_length % blocks)};
237 
238  blocks_length_type blocks_length;
239  // set all blocks_length values to block_length
240  // resp. block_length + 1 for the first `rest = block_length % blocks` values
241  if constexpr (is_dyn_scheme)
242  blocks_length.resize(blocks, block_length);
243  else
244  blocks_length.fill(block_length);
245 
246  for (uint8_t block_id = 0; block_id < rest; ++block_id)
247  ++blocks_length[block_id];
248 
249  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
250  {
251  auto const & search = search_scheme[search_id];
252 
253  auto & [search_blocks_length, start_pos] = result[search_id];
254 
255  // compute cumulative blocks_length and starting position
256  start_pos = 0;
257  if constexpr (is_dyn_scheme)
258  search_blocks_length.resize(blocks);
259  search_blocks_length[0] = blocks_length[search.pi[0] - 1];
260  for (uint8_t i = 1; i < blocks; ++i)
261  {
262  search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
263  if (search.pi[i] < search.pi[0])
264  start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
265  }
266  }
267 
268  return result;
269 }
270 
272 // forward declaration
273 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
274  typename delegate_t>
275 inline bool search_ss(cursor_t cur, query_t & query,
276  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
277  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
278  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate);
280 
311 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
312  typename delegate_t>
313 inline bool search_ss_exact(cursor_t cur, query_t & query,
314  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
315  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
316  search_t const & search, blocks_length_t const & blocks_length,
317  search_param const error_left, delegate_t && delegate)
318 {
319  using size_type = typename cursor_t::size_type;
320 
321  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
322  bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
323 
324  if (go_right)
325  {
326  size_type const infix_lb = rb - 1; // inclusive
327  size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
328 
329  if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
330  return false;
331 
332  if (search_ss<abort_on_hit>(cur, query, lb, infix_rb + 2, errors_spent, block_id2, go_right2, search,
333  blocks_length, error_left, delegate) && abort_on_hit)
334  {
335  return true;
336  }
337  }
338  else
339  {
340  size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
341  size_type const infix_rb = lb - 1; // inclusive
342 
343  if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
344  return false;
345 
346  if (search_ss<abort_on_hit>(cur, query, infix_lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
347  error_left, delegate) && abort_on_hit)
348  {
349  return true;
350  }
351  }
352  return false;
353 }
354 
360 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
361  typename delegate_t>
362 inline bool search_ss_deletion(cursor_t cur, query_t & query,
363  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
364  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
365  search_t const & search, blocks_length_t const & blocks_length,
366  search_param const error_left, delegate_t && delegate)
367 {
368  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
369  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
370 
371  // Switch to the next block when the min number of errors is reached
372  if (min_error_left_in_block == 0)
373  {
374  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
375  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
376 
377  if (search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
378  error_left, delegate) && abort_on_hit)
379  {
380  return true;
381  }
382  }
383 
384  // Insert deletions into the current block as long as possible
385  // Do not allow deletions at the beginning of the leftmost block
386  // Do not allow deletions at the end of the rightmost block
387  if (!(search.pi[block_id] == 1 && !go_right) &&
388  !(search.pi[block_id] == search.blocks() && go_right) &&
389  max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0 &&
390  ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
391  {
392  search_param error_left2{error_left};
393  error_left2.total--;
394  error_left2.deletion--;
395  do
396  {
397  if (search_ss_deletion<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search,
398  blocks_length, error_left2, delegate) && abort_on_hit)
399  {
400  return true;
401  }
402  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
403  }
404  return false;
405 }
406 
414 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
415  typename delegate_t>
416 inline bool search_ss_children(cursor_t cur, query_t & query,
417  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
418  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
419  uint8_t const min_error_left_in_block, search_t const & search,
420  blocks_length_t const & blocks_length, search_param const error_left,
421  delegate_t && delegate)
422 {
423  using size_type = typename cursor_t::size_type;
424  if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
425  {
426  size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
427 
428  size_type lb2 = lb - !go_right;
429  size_type rb2 = rb + go_right;
430 
431  do
432  {
433  bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
434 
435  // skip if there are more min errors left in the current block than characters in the block
436  // i.e. chars_left - 1 < min_error_left_in_block - delta
437  // TODO: move that outside the if / do-while struct
438  // TODO: incorporate error_left.deletion into formula
439  if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
440  continue;
441 
442  if (!delta || error_left.substitution > 0)
443  {
444  search_param error_left2{error_left};
445  error_left2.total -= delta;
446  error_left2.substitution -= delta;
447 
448  // At the end of the current block
449  if (rb - lb == blocks_length[block_id])
450  {
451  // Leave the possibility for one or multiple deletions at the end of a block.
452  // Thus do not change the direction (go_right) yet.
453  if (error_left.deletion > 0)
454  {
455  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id,
456  go_right, search, blocks_length, error_left2, delegate) &&
457  abort_on_hit)
458  {
459  return true;
460  }
461  }
462  else
463  {
464  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
465  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
466 
467  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id2, go_right2,
468  search, blocks_length, error_left2, delegate) &&
469  abort_on_hit)
470  {
471  return true;
472  }
473  }
474  }
475  else
476  {
477  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id, go_right, search,
478  blocks_length, error_left2, delegate) && abort_on_hit)
479  {
480  return true;
481  }
482  }
483  }
484 
485  // Deletion
486  // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
487  // No deletion at the beginning of the leftmost block.
488  // No deletion at the end of the rightmost block.
489  if (error_left.deletion > 0 &&
490  !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1)) &&
491  !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
492  {
493  search_param error_left3{error_left};
494  error_left3.total--;
495  error_left3.deletion--;
496  search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search, blocks_length,
497  error_left3, delegate);
498  }
499  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
500  }
501  return false;
502 }
503 
508 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t,
509  typename blocks_length_t, typename delegate_t>
510 inline bool search_ss(cursor_t cur, query_t & query,
511  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
512  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
513  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate)
514 {
515  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
516  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
517 
518  // Done.
519  if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
520  {
521  delegate(cur);
522  return true;
523  }
524  // Exact search in current block.
525  else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id])) ||
526  (error_left.total == 0 && min_error_left_in_block == 0))
527  {
528  if (search_ss_exact<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right, search, blocks_length,
529  error_left, delegate) && abort_on_hit)
530  {
531  return true;
532  }
533  }
534  // Approximate search in current block.
535  // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
536  else if (error_left.total > 0)
537  {
538  // Insertion
539  if (error_left.insertion > 0)
540  {
541  using size_type = typename cursor_t::size_type;
542 
543  size_type const lb2 = lb - !go_right;
544  size_type const rb2 = rb + go_right;
545 
546  search_param error_left2{error_left};
547  error_left2.total--;
548  error_left2.insertion--;
549  // At the end of the current block
550  if (rb - lb == blocks_length[block_id])
551  {
552  // Leave the possibility for one or multiple deletions at the end of a block.
553  // Thus do not change the direction (go_right) yet.
554  // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
555  // it pay off the additional complexity and documentation for the user? (Note that the user might only
556  // allow for insertions and deletion and not for mismatches).
557  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
558  blocks_length, error_left2, delegate) && abort_on_hit)
559  {
560  return true;
561  }
562  }
563  else
564  {
565  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
566  blocks_length, error_left2, delegate) && abort_on_hit)
567  {
568  return true;
569  }
570  }
571  }
572  if (search_ss_children<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right,
573  min_error_left_in_block, search, blocks_length, error_left, delegate) &&
574  abort_on_hit)
575  {
576  return true;
577  }
578  }
579  return false;
580 }
581 
603 template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
604 inline void search_ss(index_t const & index, query_t & query, search_param const error_left,
605  search_scheme_t const & search_scheme, delegate_t && delegate)
606 {
607  // retrieve cumulative block lengths and starting position
608  auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
609 
610  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
611  {
612  auto const & search = search_scheme[search_id];
613  auto const & [blocks_length, start_pos] = block_info[search_id];
614 
615  bool const hit = search_ss<abort_on_hit>(
616  index.cursor(), // cursor on the index
617  query, // query to be searched
618  start_pos, start_pos + 1, // infix range already searched (open interval)
619  // the first character of `query` has the index 1 (not 0)
620  0, // errors spent
621  0, // current block id in search scheme
622  true, // search the first block from left to right
623  search, blocks_length, // search scheme information
624  error_left, // errors left (broken down by error types)
625  delegate // delegate function called on hit
626  );
627 
628  if (abort_on_hit && hit)
629  return;
630  }
631 }
632 
650 template <typename configuration_t, typename index_t, typename ...policies_t>
651 template <bool abort_on_hit, typename query_t, typename delegate_t>
652 inline void search_scheme_algorithm<configuration_t, index_t, policies_t...>::search_algo_bi(
653  query_t & query,
654  search_param const error_left,
655  delegate_t && delegate)
656 {
657  switch (error_left.total)
658  {
659  case 0:
660  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 0>, delegate);
661  break;
662  case 1:
663  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 1>, delegate);
664  break;
665  case 2:
666  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 2>, delegate);
667  break;
668  case 3:
669  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 3>, delegate);
670  break;
671  default:
672  auto const & search_scheme{compute_ss(0, error_left.total)};
673  search_ss<abort_on_hit>(*index_ptr, query, error_left, search_scheme, delegate);
674  break;
675  }
676 }
678 
679 } // namespace seqan3::detail
std::false_type
search_scheme_precomputed.hpp
Provides the data structures and precomputed instances for (optimum) search schemes.
std::vector
seqan3::to_rank
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:143
std::vector::clear
T clear(T... args)
std::vector::push_back
T push_back(T... args)
std::addressof
T addressof(T... args)
slice.hpp
Provides seqan3::views::slice.
seqan3::views::move
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
transformation_trait_or.hpp
Provides seqan3::detail::transformation_trait_or.
std::array
seqan3::pack_traits::size
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
bi_fm_index_specialisation
Concept for bidirectional FM indices.
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
concept.hpp
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and its traits and cursors.
std::vector::empty
T empty(T... args)
search_traits.hpp
Provides seqan3::detail::search_traits.
search_common.hpp
Provides data structures used by different search algorithms.
std::conditional_t
std::max
T max(T... args)
seqan3::search
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:105