SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <type_traits>
17 
18 #include <sdsl/suffix_trees.hpp>
19 
20 #include <range/v3/view/slice.hpp>
21 
29 #include <seqan3/std/ranges>
30 
31 namespace seqan3
32 {
33 
58 template <typename index_t>
60 {
61 public:
62 
66  using index_type = index_t;
69  using size_type = typename index_type::size_type;
71 
72 private:
74 
78  using node_type = detail::fm_index_cursor_node<index_t>;
81  using sdsl_char_type = typename index_type::sdsl_char_type;
83  using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
85  using index_alphabet_type = typename index_t::alphabet_type;
87 
89  index_type const * index;
91  size_type parent_lb;
93  size_type parent_rb;
95  node_type node;
97  sdsl_sigma_type sigma;
98 
99  template <typename _index_t>
100  friend class bi_fm_index_cursor;
101 
103  size_type offset() const noexcept
104  {
105  assert(index->index.size() > query_length());
106  return index->index.size() - query_length() - 1; // since the string is reversed during construction
107  }
108 
110  template <detail::sdsl_index csa_t>
111  bool backward_search(csa_t const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
112  {
113  assert(l <= r && r < csa.size());
114 
115  size_type _l, _r;
116 
117  size_type cc = c;
119  {
120  cc = csa.char2comp[c];
121  if (cc == 0 && c > 0) // [[unlikely]]
122  return false;
123  }
124 
125  size_type const c_begin = csa.C[cc];
126  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
127  {
128  _l = c_begin;
129  _r = csa.C[cc + 1] - 1;
130  // if we use not the plain_byte_alphabet, we could return always return true here
131  }
132  else
133  {
134  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
135  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
136  }
137 
138  if (_r >= _l)
139  {
140  r = _r;
141  l = _l;
142  assert(r + 1 - l >= 0);
143  return true;
144  }
145  return false;
146  }
147 
148 public:
149 
153  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
155  // std::array of iterators.
156  fm_index_cursor() noexcept = default;
157  fm_index_cursor(fm_index_cursor const &) noexcept = default;
158  fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
159  fm_index_cursor(fm_index_cursor &&) noexcept = default;
160  fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
161  ~fm_index_cursor() = default;
162 
164  fm_index_cursor(index_t const & _index) noexcept : index(&_index), node({0, _index.index.size() - 1, 0, 0}),
165  sigma(_index.index.sigma - index_t::text_layout_mode)
166  {}
167  //\}
168 
181  bool operator==(fm_index_cursor const & rhs) const noexcept
182  {
183  assert(index != nullptr);
184  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
185 
186  // position in the implicit suffix tree is defined by the SA interval and depth.
187  // No need to compare parent intervals
188  return node == rhs.node;
189  }
190 
203  bool operator!=(fm_index_cursor const & rhs) const noexcept
204  {
205  assert(index != nullptr);
206 
207  return !(*this == rhs);
208  }
209 
227  bool extend_right() noexcept
228  {
229  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
230  // store all cursors at once in a private std::array of cursors
231  assert(index != nullptr);
232 
233  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
234  size_type _lb = node.lb, _rb = node.rb;
235  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
236  {
237  ++c;
238  }
239 
240  if (c != sigma)
241  {
242  parent_lb = node.lb;
243  parent_rb = node.rb;
244  node = {_lb, _rb, node.depth + 1, c};
245  return true;
246  }
247  return false;
248  }
249 
263  template <typename char_t>
264  bool extend_right(char_t const c) noexcept
265  {
267  "The character must be convertible to the alphabet of the index.");
268 
269  assert(index != nullptr);
270 
271  size_type _lb = node.lb, _rb = node.rb;
272 
273  sdsl_char_type c_char = to_rank(static_cast<index_alphabet_type>(c)) + 1;
274 
275  if (backward_search(index->index, c_char, _lb, _rb))
276  {
277  parent_lb = node.lb;
278  parent_rb = node.rb;
279  node = {_lb, _rb, node.depth + 1, c_char};
280  return true;
281  }
282  return false;
283  }
284 
301  template <std::ranges::range seq_t>
302  bool extend_right(seq_t && seq) noexcept
303  {
304  static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
305  static_assert(std::convertible_to<innermost_value_type_t<seq_t>, index_alphabet_type>,
306  "The alphabet of the sequence must be convertible to the alphabet of the index.");
307 
308  assert(index != nullptr); // range must not be empty!
309 
310  size_type _lb = node.lb, _rb = node.rb;
311  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
312 
313  sdsl_char_type c{};
314  size_t len{0};
315 
316  for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
317  {
318  c = to_rank(static_cast<index_alphabet_type>(*it)) + 1;
319 
320  new_parent_lb = _lb;
321  new_parent_rb = _rb;
322  if (!backward_search(index->index, c, _lb, _rb))
323  return false;
324  }
325 
326  parent_lb = new_parent_lb;
327  parent_rb = new_parent_rb;
328  node = {_lb, _rb, len + node.depth, c};
329  return true;
330  }
331 
358  bool cycle_back() noexcept
359  {
360  assert(index != nullptr && query_length() > 0);
361  // parent_lb > parent_rb --> invalid interval
362  assert(parent_lb <= parent_rb);
363 
364  sdsl_char_type c = node.last_char + 1;
365  size_type _lb = parent_lb, _rb = parent_rb;
366 
367  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
368  {
369  ++c;
370  }
371 
372  if (c != sigma) // Collection has additional sentinel as delimiter
373  {
374  node = {_lb, _rb, node.depth, c};
375  return true;
376  }
377  return false;
378  }
379 
395  size_type last_rank() const noexcept
396  {
397  // parent_lb > parent_rb --> invalid interval
398  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
399 
400  return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
401  }
402 
417  size_type query_length() const noexcept
418  {
419  assert(index != nullptr);
420  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
421 
422  return node.depth;
423  }
424 
442  template <std::ranges::range text_t>
443  auto path_label(text_t && text) const noexcept
445  requires index_t::text_layout_mode == text_layout::single
447  {
448  static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
449  static_assert(dimension_v<text_t> == 1, "The input cannot be a text collection.");
450  static_assert(std::same_as<innermost_value_type_t<text_t>, index_alphabet_type>,
451  "The alphabet types of the given text and index differ.");
452  assert(index != nullptr);
453 
454  size_type const query_begin = offset() - index->index[node.lb];
455  return text | views::slice(query_begin, query_begin + query_length());
456  }
457 
459  template <std::ranges::range text_t>
460  auto path_label(text_t && text) const noexcept
462  requires index_t::text_layout_mode == text_layout::collection
464  {
465  static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
466  static_assert(dimension_v<text_t> == 2, "The input must be a text collection.");
467  static_assert(std::same_as<innermost_value_type_t<text_t>, index_alphabet_type>,
468  "The alphabet types of the given text and index differ.");
469  assert(index != nullptr);
470 
471  size_type const loc = offset() - index->index[node.lb];
472  size_type const query_begin = loc - index->text_begin_rs.rank(loc + 1) + 1; // Substract delimiters
473  return text | views::join | views::slice(query_begin, query_begin + query_length());
474  }
475 
487  size_type count() const noexcept
488  {
489  assert(index != nullptr);
490 
491  return 1 + node.rb - node.lb;
492  }
493 
507  requires index_t::text_layout_mode == text_layout::single
509  {
510  assert(index != nullptr);
511 
513  for (size_type i = 0; i < occ.size(); ++i)
514  {
515  occ[i] = offset() - index->index[node.lb + i];
516  }
517  return occ;
518  }
519 
523  requires index_t::text_layout_mode == text_layout::collection
525  {
526  assert(index != nullptr);
527 
529  occ.reserve(count());
530  for (size_type i = 0; i < count(); ++i)
531  {
532  size_type loc = offset() - index->index[node.lb + i];
533  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
534  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
535  occ.emplace_back(sequence_rank - 1, sequence_position);
536  }
537  return occ;
538  }
539 
552  auto lazy_locate() const
554  requires index_t::text_layout_mode == text_layout::single
556  {
557  assert(index != nullptr);
558 
559  return std::views::iota(node.lb, node.lb + count())
560  | std::views::transform([*this, _offset = offset()] (auto sa_pos) { return _offset - index->index[sa_pos]; });
561  }
562 
564  auto lazy_locate() const
566  requires index_t::text_layout_mode == text_layout::collection
568  {
569  assert(index != nullptr);
570 
571  return std::views::iota(node.lb, node.lb + count())
572  | std::views::transform([*this, _offset = offset()] (auto sa_pos) { return _offset - index->index[sa_pos]; })
573  | std::views::transform([*this] (auto loc)
574  {
575  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
576  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
577  return std::make_pair(sequence_rank-1, sequence_position);
578  });
579  }
580 
581 };
582 
584 
585 } // namespace seqan3
seqan3::single
The text is a single range.
Definition: concept.hpp:84
seqan3::field::seq
The "sequence", usually a range of nucleotides or amino acids.
std::vector::reserve
T reserve(T... args)
std::vector
std::vector::size
T size(T... args)
seqan3::fm_index_cursor::count
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:487
fm_index_cursor.hpp
Provides the internal representation of a node of the seqan3::fm_index_cursor.
seqan3::fm_index_cursor::locate
std::vector< std::pair< size_type, size_type > > locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:521
seqan3::fm_index_cursor::size_type
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:69
seqan3::to_rank
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:142
seqan3::fm_index_cursor::operator==
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:181
seqan3::fm_index_cursor::last_rank
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:395
seqan3::fm_index_cursor::cycle_back
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:358
seqan3::text_layout
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:81
same_as
The concept std::same_as<T, U> is satisfied if and only if T and U denote the same type.
seqan3::fm_index_cursor::extend_right
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:302
seqan3::collection
The text is a range of ranges.
Definition: concept.hpp:86
slice.hpp
Provides seqan3::views::slice.
seqan3::fm_index_cursor::query_length
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_cursor.hpp:417
range.hpp
Provides various transformation traits used by the range module.
array
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:36
convertible_to
The concept std::convertible_to<From, To> specifies that an expression of the type and value category...
seqan3::fm_index_cursor::extend_right
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:264
seqan3::fm_index_cursor::operator!=
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:203
seqan3::fm_index_cursor::path_label
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition: fm_index_cursor.hpp:443
seqan3::size_type
Exposes the size_type of another type.
Definition: pre.hpp:188
join.hpp
Provides seqan3::views::join.
ranges
Adaptations of concepts from the Ranges TS.
std::vector::emplace_back
T emplace_back(T... args)
seqan3::bi_fm_index_cursor
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:55
seqan3::fm_index_cursor::locate
std::vector< size_type > locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:505
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
seqan3::innermost_value_type_t
typename innermost_value_type< t >::type innermost_value_type_t
Shortcut for seqan3::innermost_value_type (transformation_trait shortcut).
Definition: range.hpp:191
seqan3::fm_index_cursor
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:59
std::make_pair
T make_pair(T... args)
seqan3::fm_index_cursor::extend_right
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:227
seqan3::pack_traits::transform
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
seqan3::fm_index_cursor::lazy_locate
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a ranges::view is returned ...
Definition: fm_index_cursor.hpp:552
seqan3::fm_index_cursor::index_type
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:67
fm_index.hpp
Provides the unidirectional seqan3::fm_index.
csa_alphabet_strategy.hpp
Provides an alphabet mapping that implements an identity map (i.e. each character is mapped to its ra...
seqan3::fm_index_cursor::fm_index_cursor
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
concept.hpp
Core alphabet concept and free function/type trait wrappers.