SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
search_scheme_algorithm.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <type_traits>
16 
24 
25 namespace seqan3::detail
26 {
27 
37 template <typename configuration_t, typename index_t, typename ...policies_t>
39  requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
41 class search_scheme_algorithm : protected policies_t...
42 {
43 private:
45  using traits_t = search_traits<configuration_t>;
47  using search_result_type = typename traits_t::search_result_type;
48 
49  static_assert(!std::same_as<search_result_type, empty_type>, "The search result type was not configured.");
50 
51 public:
55  search_scheme_algorithm() = default;
56  search_scheme_algorithm(search_scheme_algorithm const &) = default;
57  search_scheme_algorithm(search_scheme_algorithm &&) = default;
58  search_scheme_algorithm & operator=(search_scheme_algorithm const &) = default;
59  search_scheme_algorithm & operator=(search_scheme_algorithm &&) = default;
60  ~search_scheme_algorithm() = default;
61 
73  search_scheme_algorithm(configuration_t const & cfg, index_t const & index) : policies_t{cfg}...
74  {
75  stratum = cfg.get_or(search_cfg::hit_strata{0}).stratum;
76  index_ptr = std::addressof(index);
77  }
79 
100  template <tuple_like indexed_query_t, typename callback_t>
102  requires (std::tuple_size_v<indexed_query_t> == 2) &&
103  std::ranges::forward_range<std::tuple_element_t<1, indexed_query_t>> &&
104  std::invocable<callback_t, search_result_type>
106  void operator()(indexed_query_t && indexed_query, callback_t && callback)
107  {
108  auto && [query_idx, query] = indexed_query;
109  auto error_state = this->max_error_counts(query); // see policy_max_error
110 
111  // construct internal delegate for collecting hits for later filtering (if necessary)
113  auto on_hit_delegate = [&internal_hits] (auto const & it)
114  {
115  internal_hits.push_back(it);
116  };
117 
118  perform_search_by_hit_strategy(internal_hits, query, error_state, on_hit_delegate);
119 
120  // Invoke the callback on the generated result.
121  this->make_results(std::move(internal_hits), query_idx, callback); // see policy_search_result_builder
122  }
123 
124 private:
126  index_t const * index_ptr{nullptr};
127 
129  uint8_t stratum{};
130 
131  // forward declaration
132  template <bool abort_on_hit, typename query_t, typename delegate_t>
133  inline void search_algo_bi(query_t & query, search_param const error_left, delegate_t && delegate);
134 
142  template <typename query_t, typename delegate_t>
143  void perform_search_by_hit_strategy(std::vector<typename index_t::cursor_type> & internal_hits,
144  query_t & query,
145  search_param error_state,
146  delegate_t const & on_hit_delegate)
147  {
148  if constexpr (!traits_t::search_all_hits)
149  {
150  auto max_total = error_state.total;
151  error_state.total = 0; // start search with less errors
152  while (internal_hits.empty() && error_state.total <= max_total)
153  {
154  // * If you only want the best hit (traits_t::search_single_best_hit), you stop after finding the
155  // first hit, the hit with the least errors (`abort_on_hit` is true).
156  // * If you are in strata mode (traits_t::search_strata_hits), you do the same as with best hits,
157  // but then do the extra step afterwards (`abort_on_hit` is true).
158  // * If you want all best hits (traits_t::search_all_best_hits), you do not stop after the first
159  // hit but continue the current search algorithm/max_error pattern (`abort_on_hit` is true).
160  constexpr bool abort_on_hit = !traits_t::search_all_best_hits;
161  search_algo_bi<abort_on_hit>(query, error_state, on_hit_delegate);
162  ++error_state.total;
163  }
164  if constexpr (traits_t::search_strata_hits)
165  {
166  if (!internal_hits.empty())
167  {
168  internal_hits.clear(); // TODO:don't clear when using Optimum Search Schemes with lower error bounds
169  error_state.total += stratum - 1;
170  search_algo_bi<false>(query, error_state, on_hit_delegate);
171  }
172  }
173  }
174  else // detail::search_mode_all
175  {
176  // If you want to find all hits, you cannot stop once you found any hit (<false>)
177  // since you have to find all paths in the search tree that satisfy the hit condition.
178  search_algo_bi<false>(query, error_state, on_hit_delegate);
179  }
180  }
181 };
182 
195 inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
196 {
197  // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
198  // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
199  // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
200  // first hit (e.g. search strategy: best). Even though it is not guaranteed, this seems to be a good greedy
201  // approach.
202  std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
203  return scheme;
204 }
205 
222 template <typename search_scheme_t>
223 inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
224 {
225  using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
226 
227  bool constexpr is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
228 
229  // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
230  using result_type = std::conditional_t<is_dyn_scheme,
233  transformation_trait_or_t<std::tuple_size<search_scheme_t>,
234  std::false_type>::value>>;
235 
236  result_type result;
237  if constexpr (is_dyn_scheme)
238  result.resize(search_scheme.size());
239 
240  uint8_t const blocks {search_scheme[0].blocks()};
241  size_t const block_length{query_length / blocks};
242  uint8_t const rest {static_cast<uint8_t>(query_length % blocks)};
243 
244  blocks_length_type blocks_length;
245  // set all blocks_length values to block_length
246  // resp. block_length + 1 for the first `rest = block_length % blocks` values
247  if constexpr (is_dyn_scheme)
248  blocks_length.resize(blocks, block_length);
249  else
250  blocks_length.fill(block_length);
251 
252  for (uint8_t block_id = 0; block_id < rest; ++block_id)
253  ++blocks_length[block_id];
254 
255  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
256  {
257  auto const & search = search_scheme[search_id];
258 
259  auto & [search_blocks_length, start_pos] = result[search_id];
260 
261  // compute cumulative blocks_length and starting position
262  start_pos = 0;
263  if constexpr (is_dyn_scheme)
264  search_blocks_length.resize(blocks);
265  search_blocks_length[0] = blocks_length[search.pi[0] - 1];
266  for (uint8_t i = 1; i < blocks; ++i)
267  {
268  search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
269  if (search.pi[i] < search.pi[0])
270  start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
271  }
272  }
273 
274  return result;
275 }
276 
278 // forward declaration
279 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
280  typename delegate_t>
281 inline bool search_ss(cursor_t cur, query_t & query,
282  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
283  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
284  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate);
286 
317 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
318  typename delegate_t>
319 inline bool search_ss_exact(cursor_t cur, query_t & query,
320  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
321  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
322  search_t const & search, blocks_length_t const & blocks_length,
323  search_param const error_left, delegate_t && delegate)
324 {
325  using size_type = typename cursor_t::size_type;
326 
327  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
328  bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
329 
330  if (go_right)
331  {
332  size_type const infix_lb = rb - 1; // inclusive
333  size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
334 
335  if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
336  return false;
337 
338  if (search_ss<abort_on_hit>(cur, query, lb, infix_rb + 2, errors_spent, block_id2, go_right2, search,
339  blocks_length, error_left, delegate) && abort_on_hit)
340  {
341  return true;
342  }
343  }
344  else
345  {
346  size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
347  size_type const infix_rb = lb - 1; // inclusive
348 
349  if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
350  return false;
351 
352  if (search_ss<abort_on_hit>(cur, query, infix_lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
353  error_left, delegate) && abort_on_hit)
354  {
355  return true;
356  }
357  }
358  return false;
359 }
360 
366 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
367  typename delegate_t>
368 inline bool search_ss_deletion(cursor_t cur, query_t & query,
369  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
370  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
371  search_t const & search, blocks_length_t const & blocks_length,
372  search_param const error_left, delegate_t && delegate)
373 {
374  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
375  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
376 
377  // Switch to the next block when the min number of errors is reached
378  if (min_error_left_in_block == 0)
379  {
380  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
381  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
382 
383  if (search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
384  error_left, delegate) && abort_on_hit)
385  {
386  return true;
387  }
388  }
389 
390  // Insert deletions into the current block as long as possible
391  // Do not allow deletions at the beginning of the leftmost block
392  // Do not allow deletions at the end of the rightmost block
393  if (!(search.pi[block_id] == 1 && !go_right) &&
394  !(search.pi[block_id] == search.blocks() && go_right) &&
395  max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0 &&
396  ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
397  {
398  search_param error_left2{error_left};
399  error_left2.total--;
400  error_left2.deletion--;
401  do
402  {
403  if (search_ss_deletion<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search,
404  blocks_length, error_left2, delegate) && abort_on_hit)
405  {
406  return true;
407  }
408  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
409  }
410  return false;
411 }
412 
420 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
421  typename delegate_t>
422 inline bool search_ss_children(cursor_t cur, query_t & query,
423  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
424  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
425  uint8_t const min_error_left_in_block, search_t const & search,
426  blocks_length_t const & blocks_length, search_param const error_left,
427  delegate_t && delegate)
428 {
429  using size_type = typename cursor_t::size_type;
430  if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
431  {
432  size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
433 
434  size_type lb2 = lb - !go_right;
435  size_type rb2 = rb + go_right;
436 
437  do
438  {
439  bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
440 
441  // skip if there are more min errors left in the current block than characters in the block
442  // i.e. chars_left - 1 < min_error_left_in_block - delta
443  // TODO: move that outside the if / do-while struct
444  // TODO: incorporate error_left.deletion into formula
445  if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
446  continue;
447 
448  if (!delta || error_left.substitution > 0)
449  {
450  search_param error_left2{error_left};
451  error_left2.total -= delta;
452  error_left2.substitution -= delta;
453 
454  // At the end of the current block
455  if (rb - lb == blocks_length[block_id])
456  {
457  // Leave the possibility for one or multiple deletions at the end of a block.
458  // Thus do not change the direction (go_right) yet.
459  if (error_left.deletion > 0)
460  {
461  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id,
462  go_right, search, blocks_length, error_left2, delegate) &&
463  abort_on_hit)
464  {
465  return true;
466  }
467  }
468  else
469  {
470  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
471  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
472 
473  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id2, go_right2,
474  search, blocks_length, error_left2, delegate) &&
475  abort_on_hit)
476  {
477  return true;
478  }
479  }
480  }
481  else
482  {
483  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id, go_right, search,
484  blocks_length, error_left2, delegate) && abort_on_hit)
485  {
486  return true;
487  }
488  }
489  }
490 
491  // Deletion
492  // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
493  // No deletion at the beginning of the leftmost block.
494  // No deletion at the end of the rightmost block.
495  if (error_left.deletion > 0 &&
496  !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1)) &&
497  !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
498  {
499  search_param error_left3{error_left};
500  error_left3.total--;
501  error_left3.deletion--;
502  search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search, blocks_length,
503  error_left3, delegate);
504  }
505  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
506  }
507  return false;
508 }
509 
514 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t,
515  typename blocks_length_t, typename delegate_t>
516 inline bool search_ss(cursor_t cur, query_t & query,
517  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
518  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
519  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate)
520 {
521  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
522  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
523 
524  // Done.
525  if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
526  {
527  delegate(cur);
528  return true;
529  }
530  // Exact search in current block.
531  else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id])) ||
532  (error_left.total == 0 && min_error_left_in_block == 0))
533  {
534  if (search_ss_exact<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right, search, blocks_length,
535  error_left, delegate) && abort_on_hit)
536  {
537  return true;
538  }
539  }
540  // Approximate search in current block.
541  // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
542  else if (error_left.total > 0)
543  {
544  // Insertion
545  if (error_left.insertion > 0)
546  {
547  using size_type = typename cursor_t::size_type;
548 
549  size_type const lb2 = lb - !go_right;
550  size_type const rb2 = rb + go_right;
551 
552  search_param error_left2{error_left};
553  error_left2.total--;
554  error_left2.insertion--;
555  // At the end of the current block
556  if (rb - lb == blocks_length[block_id])
557  {
558  // Leave the possibility for one or multiple deletions at the end of a block.
559  // Thus do not change the direction (go_right) yet.
560  // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
561  // it pay off the additional complexity and documentation for the user? (Note that the user might only
562  // allow for insertions and deletion and not for mismatches).
563  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
564  blocks_length, error_left2, delegate) && abort_on_hit)
565  {
566  return true;
567  }
568  }
569  else
570  {
571  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
572  blocks_length, error_left2, delegate) && abort_on_hit)
573  {
574  return true;
575  }
576  }
577  }
578  if (search_ss_children<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right,
579  min_error_left_in_block, search, blocks_length, error_left, delegate) &&
580  abort_on_hit)
581  {
582  return true;
583  }
584  }
585  return false;
586 }
587 
610 template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
611 inline void search_ss(index_t const & index, query_t & query, search_param const error_left,
612  search_scheme_t const & search_scheme, delegate_t && delegate)
613 {
614  // retrieve cumulative block lengths and starting position
615  auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
616 
617  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
618  {
619  auto const & search = search_scheme[search_id];
620  auto const & [blocks_length, start_pos] = block_info[search_id];
621 
622  bool const hit = search_ss<abort_on_hit>(
623  index.cursor(), // cursor on the index
624  query, // query to be searched
625  start_pos, start_pos + 1, // infix range already searched (open interval)
626  // the first character of `query` has the index 1 (not 0)
627  0, // errors spent
628  0, // current block id in search scheme
629  true, // search the first block from left to right
630  search, blocks_length, // search scheme information
631  error_left, // errors left (broken down by error types)
632  delegate // delegate function called on hit
633  );
634 
635  if (abort_on_hit && hit)
636  return;
637  }
638 }
639 
657 template <typename configuration_t, typename index_t, typename ...policies_t>
658 template <bool abort_on_hit, typename query_t, typename delegate_t>
659 inline void search_scheme_algorithm<configuration_t, index_t, policies_t...>::search_algo_bi(
660  query_t & query,
661  search_param const error_left,
662  delegate_t && delegate)
663 {
664  switch (error_left.total)
665  {
666  case 0:
667  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 0>, delegate);
668  break;
669  case 1:
670  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 1>, delegate);
671  break;
672  case 2:
673  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 2>, delegate);
674  break;
675  case 3:
676  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 3>, delegate);
677  break;
678  default:
679  auto const & search_scheme{compute_ss(0, error_left.total)};
680  search_ss<abort_on_hit>(*index_ptr, query, error_left, search_scheme, delegate);
681  break;
682  }
683 }
685 
686 } // namespace seqan3::detail
T addressof(T... args)
Provides the bidirectional seqan3::bi_fm_index.
T clear(T... args)
T empty(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:108
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:189
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
T max(T... args)
T push_back(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides data structures used by different search algorithms.
Provides the data structures and precomputed instances for (optimum) search schemes.
Provides seqan3::detail::search_traits.
Provides seqan3::detail::transformation_trait_or.
Provides seqan3::views::slice.