SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <seqan3/std/ranges>
17 #include <type_traits>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
27 
28 namespace seqan3
29 {
30 
55 template <typename index_t>
57 {
58 public:
62  using index_type = index_t;
65  using size_type = typename index_type::size_type;
67 
68 private:
72  using node_type = detail::fm_index_cursor_node<index_t>;
75  using sdsl_char_type = typename index_type::sdsl_char_type;
77  using sdsl_index_type = typename index_t::sdsl_index_type;
79  using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
81  using index_alphabet_type = typename index_t::alphabet_type;
87 
89  index_type const * index{nullptr};
91  size_type parent_lb{};
93  size_type parent_rb{};
95  node_type node{};
97  sdsl_sigma_type sigma{};
98 
99  template <typename _index_t>
100  friend class bi_fm_index_cursor;
101 
103  size_type offset() const noexcept
104  {
105  assert(index->index.size() > query_length());
106  return index->index.size() - query_length() - 1; // since the string is reversed during construction
107  }
108 
110  bool backward_search(sdsl_index_type const & csa,
111  sdsl_char_type const c,
112  size_type & l,
113  size_type & r) const noexcept
114  {
115  assert(l <= r && r < csa.size());
116 
117  size_type _l, _r;
118 
119  size_type cc = c;
120  if constexpr(!std::same_as<index_alphabet_type, sdsl::plain_byte_alphabet>)
121  {
122  cc = csa.char2comp[c];
123  if (cc == 0 && c > 0) // [[unlikely]]
124  return false;
125  }
126 
127  size_type const c_begin = csa.C[cc];
128  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
129  {
130  _l = c_begin;
131  _r = csa.C[cc + 1] - 1;
132  // if we use not the plain_byte_alphabet, we could return always return true here
133  }
134  else
135  {
136  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
137  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
138  }
139 
140  if (_r >= _l)
141  {
142  r = _r;
143  l = _l;
144  assert(r + 1 - l >= 0);
145  return true;
146  }
147  return false;
148  }
149 
150 public:
151 
155  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
157  // std::array of iterators.
158  fm_index_cursor() noexcept = default;
159  fm_index_cursor(fm_index_cursor const &) noexcept = default;
160  fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
161  fm_index_cursor(fm_index_cursor &&) noexcept = default;
162  fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
163  ~fm_index_cursor() = default;
164 
166  fm_index_cursor(index_t const & _index) noexcept :
167  index(&_index),
168  node({0, _index.index.size() - 1, 0, 0}),
169  sigma(_index.index.sigma - index_t::text_layout_mode)
170  {}
171  //\}
172 
185  bool operator==(fm_index_cursor const & rhs) const noexcept
186  {
187  assert(index != nullptr);
188  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
189 
190  // position in the implicit suffix tree is defined by the SA interval and depth.
191  // No need to compare parent intervals
192  return node == rhs.node;
193  }
194 
207  bool operator!=(fm_index_cursor const & rhs) const noexcept
208  {
209  assert(index != nullptr);
210 
211  return !(*this == rhs);
212  }
213 
231  bool extend_right() noexcept
232  {
233  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
234  // store all cursors at once in a private std::array of cursors
235  assert(index != nullptr);
236 
237  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
238  size_type _lb = node.lb, _rb = node.rb;
239  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
240  {
241  ++c;
242  }
243 
244  if (c != sigma)
245  {
246  parent_lb = node.lb;
247  parent_rb = node.rb;
248  node = {_lb, _rb, node.depth + 1, c};
249  return true;
250  }
251  return false;
252  }
253 
267  template <typename char_t>
269  requires std::convertible_to<char_t, index_alphabet_type>
271  bool extend_right(char_t const c) noexcept
272  {
273  assert(index != nullptr);
274  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
275  // for the indexed text.
276  assert(seqan3::to_rank(static_cast<index_alphabet_type>(c)) <
277  ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
278 
279  size_type _lb = node.lb, _rb = node.rb;
280 
281  sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
282 
283  if (backward_search(index->index, c_char, _lb, _rb))
284  {
285  parent_lb = node.lb;
286  parent_rb = node.rb;
287  node = {_lb, _rb, node.depth + 1, c_char};
288  return true;
289  }
290  return false;
291  }
292 
294  template <typename char_type>
296  requires detail::is_char_adaptation_v<char_type>
298  bool extend_right(char_type const * cstring) noexcept
299  {
301  }
302 
319  template <std::ranges::range seq_t>
320  bool extend_right(seq_t && seq) noexcept
321  {
322  static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
323  static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
324  "The alphabet of the sequence must be convertible to the alphabet of the index.");
325 
326  assert(index != nullptr); // range must not be empty!
327 
328  size_type _lb = node.lb, _rb = node.rb;
329  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
330 
331  sdsl_char_type c{};
332  size_t len{0};
333 
334  for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
335  {
336  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
337  // for the indexed text.
338  assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it)) <
339  ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
340 
341  c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
342 
343  new_parent_lb = _lb;
344  new_parent_rb = _rb;
345  if (!backward_search(index->index, c, _lb, _rb))
346  return false;
347  }
348 
349  parent_lb = new_parent_lb;
350  parent_rb = new_parent_rb;
351  node = {_lb, _rb, len + node.depth, c};
352  return true;
353  }
354 
381  bool cycle_back() noexcept
382  {
383  assert(index != nullptr && query_length() > 0);
384  // parent_lb > parent_rb --> invalid interval
385  assert(parent_lb <= parent_rb);
386 
387  sdsl_char_type c = node.last_char + 1;
388  size_type _lb = parent_lb, _rb = parent_rb;
389 
390  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
391  {
392  ++c;
393  }
394 
395  if (c != sigma) // Collection has additional sentinel as delimiter
396  {
397  node = {_lb, _rb, node.depth, c};
398  return true;
399  }
400  return false;
401  }
402 
418  size_type last_rank() const noexcept
419  {
420  // parent_lb > parent_rb --> invalid interval
421  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
422 
423  return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
424  }
425 
440  size_type query_length() const noexcept
441  {
442  assert(index != nullptr);
443  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
444 
445  return node.depth;
446  }
447 
465  template <std::ranges::range text_t>
466  auto path_label(text_t && text) const noexcept
468  requires (index_t::text_layout_mode == text_layout::single)
470  {
471  static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
472  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
473  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
474  "The alphabet types of the given text and index differ.");
475  assert(index != nullptr);
476 
477  size_type const query_begin = offset() - index->index[node.lb];
478  return text | views::slice(query_begin, query_begin + query_length());
479  }
480 
482  template <std::ranges::range text_t>
483  auto path_label(text_t && text) const noexcept
485  requires (index_t::text_layout_mode == text_layout::collection)
487  {
488  static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
489  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
490  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
491  "The alphabet types of the given text and index differ.");
492  assert(index != nullptr);
493 
494  // Position of query in concatenated text.
495  size_type const location = offset() - index->index[node.lb];
496 
497  // The rank represents the number of start positions of the individual sequences/texts in the collection
498  // before position `location + 1` and thereby also the number of delimiters.
499  size_type const rank = index->text_begin_rs.rank(location + 1);
500  assert(rank > 0);
501  size_type const text_id = rank - 1;
502 
503  // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
504  size_type const start_location = index->text_begin_ss.select(rank);
505  // Substract lengths of previous sequences.
506  size_type const query_begin = location - start_location;
507 
508  // Take subtext, slice query out of it
509  return text[text_id] | views::slice(query_begin, query_begin + query_length());
510  }
511 
523  size_type count() const noexcept
524  {
525  assert(index != nullptr);
526 
527  return 1 + node.rb - node.lb;
528  }
529 
543  requires (index_t::text_layout_mode == text_layout::single)
545  {
546  assert(index != nullptr);
547 
548  locate_result_type occ{};
549  occ.reserve(count());
550  for (size_type i = 0; i < count(); ++i)
551  {
552  occ.emplace_back(0, offset() - index->index[node.lb + i]);
553  }
554 
555  return occ;
556  }
557 
561  requires (index_t::text_layout_mode == text_layout::collection)
563  {
564  assert(index != nullptr);
565 
566  locate_result_type occ;
567  occ.reserve(count());
568  for (size_type i = 0; i < count(); ++i)
569  {
570  size_type loc = offset() - index->index[node.lb + i];
571  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
572  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
573  occ.emplace_back(sequence_rank - 1, sequence_position);
574  }
575  return occ;
576  }
577 
590  auto lazy_locate() const
592  requires (index_t::text_layout_mode == text_layout::single)
594  {
595  assert(index != nullptr);
596 
597  return std::views::iota(node.lb, node.lb + count())
598  | std::views::transform([*this, _offset = offset()] (auto sa_pos)
599  {
600  return locate_result_value_type{0u, _offset - index->index[sa_pos]};
601  });
602  }
603 
605  auto lazy_locate() const
607  requires (index_t::text_layout_mode == text_layout::collection)
609  {
610  assert(index != nullptr);
611 
612  return std::views::iota(node.lb, node.lb + count())
613  | std::views::transform([*this, _offset = offset()] (auto sa_pos)
614  {
615  return _offset - index->index[sa_pos];
616  })
617  | std::views::transform([*this] (auto loc)
618  {
619  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
620  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
621  return locate_result_value_type{sequence_rank - 1, sequence_position};
622  });
623  }
624 };
625 
627 
628 } // namespace seqan3
seqan3::single
@ single
The text is a single range.
Definition: concept.hpp:84
std::basic_string_view
std::pair
std::vector::reserve
T reserve(T... args)
std::vector
seqan3::fm_index_cursor::count
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:523
seqan3::range_innermost_value_t
typename range_innermost_value< t >::type range_innermost_value_t
Shortcut for seqan3::range_innermost_value (transformation_trait shortcut).
Definition: range.hpp:226
fm_index_cursor.hpp
Provides the internal representation of a node of the seqan3::fm_index_cursor.
seqan3::fm_index_cursor::size_type
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:65
seqan3::to_rank
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:143
seqan3::fm_index_cursor::operator==
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:185
seqan3::fm_index_cursor::last_rank
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:418
seqan3::fm_index_cursor::extend_right
bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:298
seqan3::fm_index_cursor::cycle_back
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:381
seqan3::seq
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
seqan3::text_layout
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:82
seqan3::fm_index_cursor::extend_right
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:320
seqan3::collection
@ collection
The text is a range of ranges.
Definition: concept.hpp:86
seqan3::fm_index_cursor::locate
locate_result_type locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:541
slice.hpp
Provides seqan3::views::slice.
seqan3::fm_index_cursor::query_length
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_cursor.hpp:440
range.hpp
Provides various transformation traits used by the range module.
array
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
seqan3::fm_index_cursor::extend_right
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:271
seqan3::fm_index_cursor::operator!=
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:207
seqan3::fm_index_cursor::path_label
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition: fm_index_cursor.hpp:466
seqan3::size_type
Exposes the size_type of another type.
Definition: pre.hpp:306
char.hpp
Provides alphabet adaptations for standard char types.
ranges
Adaptations of concepts from the Ranges TS.
std::vector::emplace_back
T emplace_back(T... args)
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
seqan3::fm_index_cursor
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:57
seqan3::fm_index_cursor::extend_right
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:231
seqan3::pack_traits::transform
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
seqan3::fm_index_cursor::lazy_locate
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition: fm_index_cursor.hpp:590
seqan3::fm_index_cursor::index_type
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:63
fm_index.hpp
Provides the unidirectional seqan3::fm_index.
seqan3::fm_index_cursor::fm_index_cursor
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
concept.hpp
Core alphabet concept and free function/type trait wrappers.