SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
search_scheme_algorithm.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <type_traits>
16 
23 
24 namespace seqan3::detail
25 {
26 
43 inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
44 {
45  // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
46  // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
47  // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
48  // first hit (e.g. search mode: best). Even though it is not guaranteed, this seems to be a good greedy
49  // approach.
50  std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
51  return scheme;
52 }
53 
70 template <typename search_scheme_t>
71 inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
72 {
73  using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
74 
75  bool constexpr is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
76 
77  // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
78  using result_type = std::conditional_t<is_dyn_scheme,
81  transformation_trait_or_t<std::tuple_size<search_scheme_t>,
82  std::false_type>::value>>;
83 
84  result_type result;
85  if constexpr (is_dyn_scheme)
86  result.resize(search_scheme.size());
87 
88  uint8_t const blocks {search_scheme[0].blocks()};
89  size_t const block_length{query_length / blocks};
90  uint8_t const rest {static_cast<uint8_t>(query_length % blocks)};
91 
92  blocks_length_type blocks_length;
93  // set all blocks_length values to block_length
94  // resp. block_length + 1 for the first `rest = block_length % blocks` values
95  if constexpr (is_dyn_scheme)
96  blocks_length.resize(blocks, block_length);
97  else
98  blocks_length.fill(block_length);
99 
100  for (uint8_t block_id = 0; block_id < rest; ++block_id)
101  ++blocks_length[block_id];
102 
103  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
104  {
105  auto const & search = search_scheme[search_id];
106 
107  auto & [search_blocks_length, start_pos] = result[search_id];
108 
109  // compute cumulative blocks_length and starting position
110  start_pos = 0;
111  if constexpr (is_dyn_scheme)
112  search_blocks_length.resize(blocks);
113  search_blocks_length[0] = blocks_length[search.pi[0] - 1];
114  for (uint8_t i = 1; i < blocks; ++i)
115  {
116  search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
117  if (search.pi[i] < search.pi[0])
118  start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
119  }
120  }
121 
122  return result;
123 }
124 
126 // forward declaration
127 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
128  typename delegate_t>
129 inline bool search_ss(cursor_t cur, query_t & query,
130  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
131  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
132  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate);
134 
165 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
166  typename delegate_t>
167 inline bool search_ss_exact(cursor_t cur, query_t & query,
168  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
169  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
170  search_t const & search, blocks_length_t const & blocks_length,
171  search_param const error_left, delegate_t && delegate)
172 {
173  using size_type = typename cursor_t::size_type;
174 
175  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
176  bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
177 
178  if (go_right)
179  {
180  size_type const infix_lb = rb - 1; // inclusive
181  size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
182 
183  if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
184  return false;
185 
186  if (search_ss<abort_on_hit>(cur, query, lb, infix_rb + 2, errors_spent, block_id2, go_right2, search,
187  blocks_length, error_left, delegate) && abort_on_hit)
188  {
189  return true;
190  }
191  }
192  else
193  {
194  size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
195  size_type const infix_rb = lb - 1; // inclusive
196 
197  if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
198  return false;
199 
200  if (search_ss<abort_on_hit>(cur, query, infix_lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
201  error_left, delegate) && abort_on_hit)
202  {
203  return true;
204  }
205  }
206  return false;
207 }
208 
214 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
215  typename delegate_t>
216 inline bool search_ss_deletion(cursor_t cur, query_t & query,
217  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
218  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
219  search_t const & search, blocks_length_t const & blocks_length,
220  search_param const error_left, delegate_t && delegate)
221 {
222  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
223  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
224 
225  // Switch to the next block when the min number of errors is reached
226  if (min_error_left_in_block == 0)
227  {
228  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
229  bool const go_right2 = search.pi[block_id2] > search.pi[block_id2 - 1];
230 
231  if (search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
232  error_left, delegate) && abort_on_hit)
233  {
234  return true;
235  }
236  }
237 
238  // Insert deletions into the current block as long as possible
239  // Do not allow deletions at the beginning of the leftmost block
240  // Do not allow deletions at the end of the rightmost block
241  if (!(search.pi[block_id] == 1 && !go_right) &&
242  !(search.pi[block_id] == search.blocks() && go_right) &&
243  max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0 &&
244  ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
245  {
246  search_param error_left2{error_left};
247  error_left2.total--;
248  error_left2.deletion--;
249  do
250  {
251  if (search_ss_deletion<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search,
252  blocks_length, error_left2, delegate) && abort_on_hit)
253  {
254  return true;
255  }
256  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
257  }
258  return false;
259 }
260 
268 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t, typename blocks_length_t,
269  typename delegate_t>
270 inline bool search_ss_children(cursor_t cur, query_t & query,
271  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
272  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
273  uint8_t const min_error_left_in_block, search_t const & search,
274  blocks_length_t const & blocks_length, search_param const error_left,
275  delegate_t && delegate)
276 {
277  using size_type = typename cursor_t::size_type;
278  if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
279  {
280  size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
281 
282  size_type lb2 = lb - !go_right;
283  size_type rb2 = rb + go_right;
284 
285  do
286  {
287  bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
288 
289  // skip if there are more min errors left in the current block than characters in the block
290  // i.e. chars_left - 1 < min_error_left_in_block - delta
291  // TODO: move that outside the if / do-while struct
292  // TODO: incorporate error_left.deletion into formula
293  if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
294  continue;
295 
296  if (!delta || error_left.substitution > 0)
297  {
298  search_param error_left2{error_left};
299  error_left2.total -= delta;
300  error_left2.substitution -= delta;
301 
302  // At the end of the current block
303  if (rb - lb == blocks_length[block_id])
304  {
305  // Leave the possibility for one or multiple deletions at the end of a block.
306  // Thus do not change the direction (go_right) yet.
307  if (error_left.deletion > 0)
308  {
309  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id,
310  go_right, search, blocks_length, error_left2, delegate) &&
311  abort_on_hit)
312  {
313  return true;
314  }
315  }
316  else
317  {
318  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
319  bool const go_right2 = search.pi[block_id2] > search.pi[block_id2 - 1];
320 
321  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id2, go_right2,
322  search, blocks_length, error_left2, delegate) &&
323  abort_on_hit)
324  {
325  return true;
326  }
327  }
328  }
329  else
330  {
331  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + delta, block_id, go_right, search,
332  blocks_length, error_left2, delegate) && abort_on_hit)
333  {
334  return true;
335  }
336  }
337  }
338 
339  // Deletion
340  // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
341  // No deletion at the beginning of the leftmost block.
342  // No deletion at the end of the rightmost block.
343  if (error_left.deletion > 0 &&
344  !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1)) &&
345  !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
346  {
347  search_param error_left3{error_left};
348  error_left3.total--;
349  error_left3.deletion--;
350  search_ss<abort_on_hit>(cur, query, lb, rb, errors_spent + 1, block_id, go_right, search, blocks_length,
351  error_left3, delegate);
352  }
353  } while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
354  }
355  return false;
356 }
357 
362 template <bool abort_on_hit, typename cursor_t, typename query_t, typename search_t,
363  typename blocks_length_t, typename delegate_t>
364 inline bool search_ss(cursor_t cur, query_t & query,
365  typename cursor_t::size_type const lb, typename cursor_t::size_type const rb,
366  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
367  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate)
368 {
369  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
370  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
371 
372  // Done.
373  if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
374  {
375  delegate(cur);
376  return true;
377  }
378  // Exact search in current block.
379  else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id])) ||
380  (error_left.total == 0 && min_error_left_in_block == 0))
381  {
382  if (search_ss_exact<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right, search, blocks_length,
383  error_left, delegate) && abort_on_hit)
384  {
385  return true;
386  }
387  }
388  // Approximate search in current block.
389  // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
390  else if (error_left.total > 0)
391  {
392  // Insertion
393  if (error_left.insertion > 0)
394  {
395  using size_type = typename cursor_t::size_type;
396 
397  size_type const lb2 = lb - !go_right;
398  size_type const rb2 = rb + go_right;
399 
400  search_param error_left2{error_left};
401  error_left2.total--;
402  error_left2.insertion--;
403  // At the end of the current block
404  if (rb - lb == blocks_length[block_id])
405  {
406  // Leave the possibility for one or multiple deletions at the end of a block.
407  // Thus do not change the direction (go_right) yet.
408  // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
409  // it pay off the additional complexity and documentation for the user? (Note that the user might only
410  // allow for insertions and deletion and not for mismatches).
411  if (search_ss_deletion<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
412  blocks_length, error_left2, delegate) && abort_on_hit)
413  {
414  return true;
415  }
416  }
417  else
418  {
419  if (search_ss<abort_on_hit>(cur, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
420  blocks_length, error_left2, delegate) && abort_on_hit)
421  {
422  return true;
423  }
424  }
425  }
426  if (search_ss_children<abort_on_hit>(cur, query, lb, rb, errors_spent, block_id, go_right,
427  min_error_left_in_block, search, blocks_length, error_left, delegate) &&
428  abort_on_hit)
429  {
430  return true;
431  }
432  }
433  return false;
434 }
435 
457 template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
458 inline void search_ss(index_t const & index, query_t & query, search_param const error_left,
459  search_scheme_t const & search_scheme, delegate_t && delegate)
460 {
461  // retrieve cumulative block lengths and starting position
462  auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
463 
464  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
465  {
466  auto const & search = search_scheme[search_id];
467  auto const & [blocks_length, start_pos] = block_info[search_id];
468 
469  bool const hit = search_ss<abort_on_hit>(
470  index.begin(), // cursor on the index
471  query, // query to be searched
472  start_pos, start_pos + 1, // infix range already searched (open interval)
473  // the first character of `query` has the index 1 (not 0)
474  0, // errors spent
475  0, // current block id in search scheme
476  true, // search the first block from left to right
477  search, blocks_length, // search scheme information
478  error_left, // errors left (broken down by error types)
479  delegate // delegate function called on hit
480  );
481 
482  if (abort_on_hit && hit)
483  return;
484  }
485 }
486 
506 template <bool abort_on_hit, typename index_t, typename query_t, typename delegate_t>
507 inline void search_algo_bi(index_t const & index, query_t & query, search_param const error_left,
508  delegate_t && delegate)
509 {
510  switch (error_left.total)
511  {
512  case 0:
513  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 0>, delegate);
514  break;
515  case 1:
516  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 1>, delegate);
517  break;
518  case 2:
519  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 2>, delegate);
520  break;
521  case 3:
522  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 3>, delegate);
523  break;
524  default:
525  auto const & search_scheme{compute_ss(0, error_left.total)};
526  search_ss<abort_on_hit>(index, query, error_left, search_scheme, delegate);
527  break;
528  }
529 }
530 
535 template <bool abort_on_hit, typename index_t, typename query_t, typename delegate_t>
536 inline void search_algo_uni(index_t const & index, query_t & query, search_param const error_left,
537  delegate_t && delegate)
538 {
539  search_trivial<abort_on_hit>(index, query, error_left, delegate);
540 }
541 
546 template <bool abort_on_hit, typename index_t, typename query_t, typename delegate_t>
547 inline void search_algo(index_t const & index, query_t & query, search_param const error_left, delegate_t && delegate)
548 {
550  search_algo_bi<abort_on_hit>(index, query, error_left, delegate);
551  else
552  search_algo_uni<abort_on_hit>(index, query, error_left, delegate);
553 }
554 
556 
557 } // namespace seqan3::detail
std::false_type
seqan3::search
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:167
search_scheme_precomputed.hpp
Provides the data structures and precomputed instances for (optimum) search schemes.
std::vector
seqan3::to_rank
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:142
same_as
The concept std::same_as<T, U> is satisfied if and only if T and U denote the same type.
slice.hpp
Provides seqan3::views::slice.
transformation_trait_or.hpp
Provides seqan3::detail::transformation_trait_or.
std::array
search_trivial.hpp
Provides an approximate string matching algorithm based on simple backtracking. This should only be u...
seqan3::pack_traits::size
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
bi_fm_index_specialisation
Concept for bidirectional FM indices.
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
concept.hpp
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and its traits and cursors.
search_common.hpp
Provides data structures used by different search algorithms.
std::conditional_t
std::max
T max(T... args)