SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <seqan3/std/ranges>
17 #include <type_traits>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
27 
28 namespace seqan3
29 {
30 
37 {
39  size_t begin_position{};
41  size_t end_position{};
42 
51  friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
52  {
53  return lhs.begin_position == rhs.begin_position && lhs.end_position == rhs.end_position;
54  }
55 
61  friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
62  {
63  return !(lhs == rhs);
64  }
66 };
67 
88 template <typename index_t>
90 {
91 public:
96  using index_type = index_t;
98  using size_type = typename index_type::size_type;
100 
101 private:
106  using node_type = detail::fm_index_cursor_node<index_t>;
108  using sdsl_char_type = typename index_type::sdsl_char_type;
110  using sdsl_index_type = typename index_t::sdsl_index_type;
112  using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
114  using index_alphabet_type = typename index_t::alphabet_type;
120 
122  index_type const * index{nullptr};
124  size_type parent_lb{};
126  size_type parent_rb{};
128  node_type node{};
130  sdsl_sigma_type sigma{};
131 
132  template <typename _index_t>
133  friend class bi_fm_index_cursor;
134 
136  size_type offset() const noexcept
137  {
138  assert(index->index.size() > query_length());
139  return index->index.size() - query_length() - 1; // since the string is reversed during construction
140  }
141 
143  bool backward_search(sdsl_index_type const & csa,
144  sdsl_char_type const c,
145  size_type & l,
146  size_type & r) const noexcept
147  {
148  assert(l <= r && r < csa.size());
149 
150  size_type _l, _r;
151 
152  size_type cc = c;
153  if constexpr(!std::same_as<index_alphabet_type, sdsl::plain_byte_alphabet>)
154  {
155  cc = csa.char2comp[c];
156  if (cc == 0 && c > 0) // [[unlikely]]
157  return false;
158  }
159 
160  size_type const c_begin = csa.C[cc];
161  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
162  {
163  _l = c_begin;
164  _r = csa.C[cc + 1] - 1;
165  // if we use not the plain_byte_alphabet, we could return always return true here
166  }
167  else
168  {
169  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
170  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
171  }
172 
173  if (_r >= _l)
174  {
175  r = _r;
176  l = _l;
177  assert(r + 1 - l >= 0);
178  return true;
179  }
180  return false;
181  }
182 
183 public:
184 
189  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
190  // std::array of iterators.
191  fm_index_cursor() noexcept = default;
192  fm_index_cursor(fm_index_cursor const &) noexcept = default;
193  fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
194  fm_index_cursor(fm_index_cursor &&) noexcept = default;
195  fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
196  ~fm_index_cursor() = default;
197 
199  fm_index_cursor(index_t const & _index) noexcept :
200  index(&_index),
201  node({0, _index.index.size() - 1, 0, 0}),
202  sigma(_index.index.sigma - index_t::text_layout_mode)
203  {
204  assert(_index.index.size() != 0);
205  }
206  //\}
207 
220  bool operator==(fm_index_cursor const & rhs) const noexcept
221  {
222  assert(index != nullptr);
223  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
224 
225  // position in the implicit suffix tree is defined by the SA interval and depth.
226  // No need to compare parent intervals
227  return node == rhs.node;
228  }
229 
242  bool operator!=(fm_index_cursor const & rhs) const noexcept
243  {
244  assert(index != nullptr);
245 
246  return !(*this == rhs);
247  }
248 
266  bool extend_right() noexcept
267  {
268  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
269  // store all cursors at once in a private std::array of cursors
270  assert(index != nullptr);
271 
272  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
273  size_type _lb = node.lb, _rb = node.rb;
274  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
275  {
276  ++c;
277  }
278 
279  if (c != sigma)
280  {
281  parent_lb = node.lb;
282  parent_rb = node.rb;
283  node = {_lb, _rb, node.depth + 1, c};
284  return true;
285  }
286  return false;
287  }
288 
302  template <typename char_t>
304  requires std::convertible_to<char_t, index_alphabet_type>
306  bool extend_right(char_t const c) noexcept
307  {
308  assert(index != nullptr);
309  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
310  // for the indexed text.
311  assert(seqan3::to_rank(static_cast<index_alphabet_type>(c)) <
312  ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
313 
314  size_type _lb = node.lb, _rb = node.rb;
315 
316  sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
317 
318  if (backward_search(index->index, c_char, _lb, _rb))
319  {
320  parent_lb = node.lb;
321  parent_rb = node.rb;
322  node = {_lb, _rb, node.depth + 1, c_char};
323  return true;
324  }
325  return false;
326  }
327 
329  template <typename char_type>
331  requires detail::is_char_adaptation_v<char_type>
333  bool extend_right(char_type const * cstring) noexcept
334  {
336  }
337 
354  template <std::ranges::range seq_t>
355  bool extend_right(seq_t && seq) noexcept
356  {
357  static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
358  static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
359  "The alphabet of the sequence must be convertible to the alphabet of the index.");
360 
361  assert(index != nullptr); // range must not be empty!
362 
363  size_type _lb = node.lb, _rb = node.rb;
364  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
365 
366  sdsl_char_type c{};
367  size_t len{0};
368 
369  for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
370  {
371  // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
372  // for the indexed text.
373  assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it)) <
374  ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
375 
376  c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
377 
378  new_parent_lb = _lb;
379  new_parent_rb = _rb;
380  if (!backward_search(index->index, c, _lb, _rb))
381  return false;
382  }
383 
384  parent_lb = new_parent_lb;
385  parent_rb = new_parent_rb;
386  node = {_lb, _rb, len + node.depth, c};
387  return true;
388  }
389 
416  bool cycle_back() noexcept
417  {
418  assert(index != nullptr && query_length() > 0);
419  // parent_lb > parent_rb --> invalid interval
420  assert(parent_lb <= parent_rb);
421 
422  sdsl_char_type c = node.last_char + 1;
423  size_type _lb = parent_lb, _rb = parent_rb;
424 
425  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
426  {
427  ++c;
428  }
429 
430  if (c != sigma) // Collection has additional sentinel as delimiter
431  {
432  node = {_lb, _rb, node.depth, c};
433  return true;
434  }
435  return false;
436  }
437 
453  size_type last_rank() const noexcept
454  {
455  // parent_lb > parent_rb --> invalid interval
456  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
457 
458  return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
459  }
460 
477  {
478  assert(index != nullptr);
479 
480  return {node.lb, node.rb + 1};
481  }
482 
501  size_type query_length() const noexcept
502  {
503  assert(index != nullptr);
504  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
505 
506  return node.depth;
507  }
508 
526  template <std::ranges::range text_t>
527  auto path_label(text_t && text) const noexcept
529  requires (index_t::text_layout_mode == text_layout::single)
531  {
532  static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
533  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
534  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
535  "The alphabet types of the given text and index differ.");
536  assert(index != nullptr);
537 
538  size_type const query_begin = offset() - index->index[node.lb];
539  return text | views::slice(query_begin, query_begin + query_length());
540  }
541 
543  template <std::ranges::range text_t>
544  auto path_label(text_t && text) const noexcept
546  requires (index_t::text_layout_mode == text_layout::collection)
548  {
549  static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
550  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
551  static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
552  "The alphabet types of the given text and index differ.");
553  assert(index != nullptr);
554 
555  // Position of query in concatenated text.
556  size_type const location = offset() - index->index[node.lb];
557 
558  // The rank represents the number of start positions of the individual sequences/texts in the collection
559  // before position `location + 1` and thereby also the number of delimiters.
560  size_type const rank = index->text_begin_rs.rank(location + 1);
561  assert(rank > 0);
562  size_type const text_id = rank - 1;
563 
564  // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
565  size_type const start_location = index->text_begin_ss.select(rank);
566  // Substract lengths of previous sequences.
567  size_type const query_begin = location - start_location;
568 
569  // Take subtext, slice query out of it
570  return text[text_id] | views::slice(query_begin, query_begin + query_length());
571  }
572 
584  size_type count() const noexcept
585  {
586  assert(index != nullptr);
587 
588  return 1 + node.rb - node.lb;
589  }
590 
604  requires (index_t::text_layout_mode == text_layout::single)
606  {
607  assert(index != nullptr);
608 
609  locate_result_type occ{};
610  occ.reserve(count());
611  for (size_type i = 0; i < count(); ++i)
612  {
613  occ.emplace_back(0, offset() - index->index[node.lb + i]);
614  }
615 
616  return occ;
617  }
618 
622  requires (index_t::text_layout_mode == text_layout::collection)
624  {
625  assert(index != nullptr);
626 
627  locate_result_type occ;
628  occ.reserve(count());
629  for (size_type i = 0; i < count(); ++i)
630  {
631  size_type loc = offset() - index->index[node.lb + i];
632  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
633  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
634  occ.emplace_back(sequence_rank - 1, sequence_position);
635  }
636  return occ;
637  }
638 
651  auto lazy_locate() const
653  requires (index_t::text_layout_mode == text_layout::single)
655  {
656  assert(index != nullptr);
657 
658  return std::views::iota(node.lb, node.lb + count())
659  | std::views::transform([*this, _offset = offset()] (auto sa_pos)
660  {
661  return locate_result_value_type{0u, _offset - index->index[sa_pos]};
662  });
663  }
664 
666  auto lazy_locate() const
668  requires (index_t::text_layout_mode == text_layout::collection)
670  {
671  assert(index != nullptr);
672 
673  return std::views::iota(node.lb, node.lb + count())
674  | std::views::transform([*this, _offset = offset()] (auto sa_pos)
675  {
676  auto loc = _offset - index->index[sa_pos];
677  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
678  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
679  return locate_result_value_type{sequence_rank - 1, sequence_position};
680  });
681  }
682 
690  template <cereal_archive archive_t>
691  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
692  {
693  archive(parent_lb);
694  archive(parent_rb);
695  archive(node);
696  archive(sigma);
697  }
699 };
700 
702 
703 } // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:90
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:355
locate_result_type locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:602
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:306
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition: fm_index_cursor.hpp:527
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:416
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:242
seqan3::suffix_array_interval suffix_array_interval() const noexcept
Returns the half-open suffix array interval.
Definition: fm_index_cursor.hpp:476
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:220
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:584
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:453
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:266
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_cursor.hpp:501
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:96
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:98
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition: fm_index_cursor.hpp:651
bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:333
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T emplace_back(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
typename range_innermost_value< t >::type range_innermost_value_t
Shortcut for seqan3::range_innermost_value (transformation_trait shortcut).
Definition: type_traits.hpp:242
@ seq
The "sequence", usually a range of nucleotides or amino acids.
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:81
@ single
The text is a single range.
Definition: concept.hpp:83
@ collection
The text is a range of ranges.
Definition: concept.hpp:85
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:189
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Exposes the size_type of another type.
Definition: pre.hpp:296
The underlying suffix array interval.
Definition: fm_index_cursor.hpp:37
size_t end_position
The exclusive end position of the interval ("right boundary").
Definition: fm_index_cursor.hpp:41
size_t begin_position
The begin position of the interval ("left boundary").
Definition: fm_index_cursor.hpp:39
friend bool operator==(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for equality.
Definition: fm_index_cursor.hpp:51
friend bool operator!=(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for inequality.
Definition: fm_index_cursor.hpp:61
Provides seqan3::views::slice.