SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <type_traits>
17 
18 #include <sdsl/suffix_trees.hpp>
19 
20 #include <range/v3/view/slice.hpp>
21 
22 #include <seqan3/alphabet/all.hpp>
28 #include <seqan3/std/ranges>
29 
30 // namespace seqan3::detail
31 // {
32 // // forward declaration
33 // auto get_suffix_array_range(fm_index_cursor<index_t> const & it);
34 // } // namespace seqan3::detail
35 
36 namespace seqan3
37 {
38 
63 template <typename index_t>
65 {
66 
67 public:
68 
72  using index_type = index_t;
75  using size_type = typename index_type::size_type;
77 
78 protected:
80 
84  using node_type = detail::fm_index_cursor_node<index_t>;
87  using sdsl_char_type = typename index_type::sdsl_char_type;
89  using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
91 
93  index_type const * index;
95  size_type parent_lb;
97  size_type parent_rb;
99  node_type node;
101  sdsl_sigma_type sigma;
102 
103  template <typename _index_t>
104  friend class bi_fm_index_cursor;
105 
106  // friend detail::get_suffix_array_range;
107 
109  size_type offset() const noexcept
110  {
111  assert(index->index.size() > query_length());
112  return index->index.size() - query_length() - 1; // since the string is reversed during construction
113  }
114 
116  template <detail::SdslIndex csa_t>
117  bool backward_search(csa_t const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
118  {
119  assert(l <= r && r < csa.size());
120 
121  size_type _l, _r;
122 
123  size_type cc = c;
125  {
126  cc = csa.char2comp[c];
127  if (cc == 0 && c > 0) // [[unlikely]]
128  return false;
129  }
130 
131  size_type const c_begin = csa.C[cc];
132  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
133  {
134  _l = c_begin;
135  _r = csa.C[cc + 1] - 1;
136  // if we use not the plain_byte_alphabet, we could return always return true here
137  }
138  else
139  {
140  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
141  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
142  }
143 
144  if (_r >= _l)
145  {
146  r = _r;
147  l = _l;
148  assert(r + 1 - l >= 0);
149  return true;
150  }
151  return false;
152  }
153 
154 public:
155 
159  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
161  // std::array of iterators.
162  fm_index_cursor() noexcept = default;
163  fm_index_cursor(fm_index_cursor const &) noexcept = default;
164  fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
165  fm_index_cursor(fm_index_cursor &&) noexcept = default;
166  fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
167  ~fm_index_cursor() = default;
168 
170  fm_index_cursor(index_t const & _index) noexcept : index(&_index), node({0, _index.index.size() - 1, 0, 0}),
171  sigma(_index.index.sigma - index_t::is_collection_)
172  {}
173  //\}
174 
187  bool operator==(fm_index_cursor const & rhs) const noexcept
188  {
189  assert(index != nullptr);
190  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
191 
192  // position in the implicit suffix tree is defined by the SA interval and depth.
193  // No need to compare parent intervals
194  return node == rhs.node;
195  }
196 
209  bool operator!=(fm_index_cursor const & rhs) const noexcept
210  {
211  assert(index != nullptr);
212 
213  return !(*this == rhs);
214  }
215 
233  bool extend_right() noexcept
234  {
235  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
236  // store all cursors at once in a private std::array of cursors
237  assert(index != nullptr);
238 
239  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
240  size_type _lb = node.lb, _rb = node.rb;
241  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
242  {
243  ++c;
244  }
245 
246  if (c != sigma)
247  {
248  parent_lb = node.lb;
249  parent_rb = node.rb;
250  node = {_lb, _rb, node.depth + 1, c};
251  return true;
252  }
253  return false;
254  }
255 
270  template <Alphabet char_t>
271  bool extend_right(char_t const c) noexcept
272  {
273  assert(index != nullptr);
274  assert(index->sigma == alphabet_size<char_t>);
275 
276  size_type _lb = node.lb, _rb = node.rb;
277 
278  sdsl_char_type c_char = to_rank(c) + 1;
279 
280  if (backward_search(index->index, c_char, _lb, _rb))
281  {
282  parent_lb = node.lb;
283  parent_rb = node.rb;
284  node = {_lb, _rb, node.depth + 1, c_char};
285  return true;
286  }
287  return false;
288  }
289 
306  template <std::ranges::Range seq_t>
307  bool extend_right(seq_t && seq) noexcept
308  {
309  static_assert(std::ranges::ForwardRange<seq_t>, "The query must model ForwardRange.");
310  assert(index != nullptr); // range must not be empty!
311  assert(index->sigma == alphabet_size<innermost_value_type_t<seq_t>>);
312 
313  size_type _lb = node.lb, _rb = node.rb;
314  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
315 
316  sdsl_char_type c{};
317  size_t len{0};
318 
319  for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
320  {
321  c = to_rank(*it) + 1;
322 
323  new_parent_lb = _lb;
324  new_parent_rb = _rb;
325  if (!backward_search(index->index, c, _lb, _rb))
326  return false;
327  }
328 
329  parent_lb = new_parent_lb;
330  parent_rb = new_parent_rb;
331  node = {_lb, _rb, len + node.depth, c};
332  return true;
333  }
334 
361  bool cycle_back() noexcept
362  {
363  assert(index != nullptr && query_length() > 0);
364  // parent_lb > parent_rb --> invalid interval
365  assert(parent_lb <= parent_rb);
366 
367  sdsl_char_type c = node.last_char + 1;
368  size_type _lb = parent_lb, _rb = parent_rb;
369 
370  while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
371  {
372  ++c;
373  }
374 
375  if (c != sigma) // Collection has additional sentinel as delimiter
376  {
377  node = {_lb, _rb, node.depth, c};
378  return true;
379  }
380  return false;
381  }
382 
398  size_type last_rank() const noexcept
399  {
400  // parent_lb > parent_rb --> invalid interval
401  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
402 
403  return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
404  }
405 
420  size_type query_length() const noexcept
421  {
422  assert(index != nullptr);
423  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
424 
425  return node.depth;
426  }
427 
445  template <std::ranges::Range text_t>
446  auto path_label(text_t && text) const noexcept
448  requires !index_t::is_collection_
450  {
451  static_assert(std::ranges::InputRange<text_t>, "The text must model InputRange.");
452  static_assert(!(dimension_v<text_t> != 1), "The input cannot be a text collection.");
453  assert(index != nullptr);
454  assert(index->sigma == alphabet_size<value_type_t<text_t>>);
455 
456  size_type const query_begin = offset() - index->index[node.lb];
457  return text | view::slice(query_begin, query_begin + query_length());
458  }
459 
461  template <std::ranges::Range text_t>
462  auto path_label(text_t && text) const noexcept
464  requires index_t::is_collection_
466  {
467  static_assert(std::ranges::InputRange<text_t>, "The text collection must model InputRange.");
468  static_assert(!(dimension_v<text_t> != 2), "The input must be a text collection.");
469  assert(index != nullptr);
470  assert(index->sigma == alphabet_size<innermost_value_type_t<text_t>>);
471 
472  size_type const loc = offset() - index->index[node.lb];
473  size_type const query_begin = loc - index->text_begin_rs.rank(loc + 1) + 1; // Substract delimiters
474  return text | std::view::join | view::slice(query_begin, query_begin + query_length());
475  }
476 
488  size_type count() const noexcept
489  {
490  assert(index != nullptr);
491 
492  return 1 + node.rb - node.lb;
493  }
494 
508  requires !index_t::is_collection_
510  {
511  assert(index != nullptr);
512 
514  for (size_type i = 0; i < occ.size(); ++i)
515  {
516  occ[i] = offset() - index->index[node.lb + i];
517  }
518  return occ;
519  }
520 
524  requires index_t::is_collection_
526  {
527  assert(index != nullptr);
528 
530  occ.reserve(count());
531  for (size_type i = 0; i < count(); ++i)
532  {
533  size_type loc = offset() - index->index[node.lb + i];
534  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
535  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
536  occ.emplace_back(sequence_rank - 1, sequence_position);
537  }
538  return occ;
539  }
540 
553  auto lazy_locate() const
555  requires !index_t::is_collection_
557  {
558  assert(index != nullptr);
559 
560  return std::view::iota(node.lb, node.lb + count())
561  | std::view::transform([*this, _offset = offset()] (auto sa_pos) { return _offset - index->index[sa_pos]; });
562  }
563 
565  auto lazy_locate() const
567  requires index_t::is_collection_
569  {
570  assert(index != nullptr);
571 
572  return std::view::iota(node.lb, node.lb + count())
573  | std::view::transform([*this, _offset = offset()] (auto sa_pos) { return _offset - index->index[sa_pos]; })
574  | std::view::transform([*this] (auto loc)
575  {
576  size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
577  size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
578  return std::make_pair(sequence_rank-1, sequence_position);
579  });
580  }
581 
582 };
583 
585 
586 } // namespace seqan3
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:187
typename value_type< t >::type value_type_t
Shortcut for seqan3::value_type (TransformationTrait shortcut).
Definition: pre.hpp:48
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:75
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:271
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:209
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:398
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:103
std::vector< size_type > locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:506
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:488
Provides an alphabet mapping that implements an identity map (i.e. each character is mapped to its ra...
The main SeqAn3 namespace.
constexpr auto join
Flattens a View of ranges into a View.
Definition: ranges:683
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:144
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:73
Meta-header for the alphabet module.
Adaptations of concepts from the Ranges TS.
typename innermost_value_type< t >::type innermost_value_type_t
Shortcut for seqan3::innermost_value_type (TransformationTrait shortcut).
Definition: range.hpp:191
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
T make_pair(T... args)
Exposes the size_type of another type.
Definition: pre.hpp:188
constexpr auto iota
Generates a sequence of elements by repeatedly incrementing an initial value.
Definition: ranges:647
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:307
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition: fm_index_cursor.hpp:446
T size(T... args)
index_type const * index
Type of the underlying FM index.
Definition: bi_fm_index_cursor.hpp:83
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_cursor.hpp:420
Specifies requirements of a Range type for which begin returns a type that models std::InputIterator...
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:54
Provides seqan3::view::slice.
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:233
std::vector< std::pair< size_type, size_type > > locate() const
Definition: fm_index_cursor.hpp:522
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:361
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
Specifies requirements of a Range type for which begin returns a type that models std::ForwardIterato...
Provides the unidirectional seqan3::fm_index.
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
constexpr auto alphabet_size
A type trait that holds the size of a (semi-)alphabet.
Definition: concept.hpp:678
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a ranges::view is returned ...
Definition: fm_index_cursor.hpp:553
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:179
constexpr auto transform
A range adaptor that takes a invocable and returns a view of the elements with the invocable applied...
Definition: ranges:911
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:64
T reserve(T... args)
T emplace_back(T... args)