SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
fm_index_cursor.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <array>
16#include <ranges>
17#include <type_traits>
18
19#include <sdsl/suffix_trees.hpp>
20
27
28namespace seqan3
29{
33{
37 size_t end_position{};
38
47 friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
48 {
49 return lhs.begin_position == rhs.begin_position && lhs.end_position == rhs.end_position;
50 }
51
57 friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
58 {
59 return !(lhs == rhs);
60 }
62};
63
85template <typename index_t>
87{
88public:
93 using index_type = index_t;
95 using size_type = typename index_type::size_type;
97
98private:
103 using node_type = detail::fm_index_cursor_node<index_t>;
105 using sdsl_char_type = typename index_type::sdsl_char_type;
107 using sdsl_index_type = typename index_t::sdsl_index_type;
109 using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
111 using index_alphabet_type = typename index_t::alphabet_type;
117
119 index_type const * index{nullptr};
121 size_type parent_lb{};
123 size_type parent_rb{};
125 node_type node{};
127 sdsl_sigma_type sigma{};
128
129 template <typename _index_t>
130 friend class bi_fm_index_cursor;
131
133 size_type offset() const noexcept
134 {
135 assert(index->index.size() > query_length());
136 return index->index.size() - query_length() - 1; // since the string is reversed during construction
137 }
138
140 bool
141 backward_search(sdsl_index_type const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
142 {
143 assert(l <= r && r < csa.size());
144
145 size_type _l, _r;
146
147 size_type cc = c;
148 if constexpr (!std::same_as<index_alphabet_type, sdsl::plain_byte_alphabet>)
149 {
150 cc = csa.char2comp[c];
151 if (cc == 0 && c > 0) // [[unlikely]]
152 return false;
153 }
154
155 size_type const c_begin = csa.C[cc];
156 if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
157 {
158 _l = c_begin;
159 _r = csa.C[cc + 1] - 1;
160 // if we use not the plain_byte_alphabet, we could return always return true here
161 }
162 else
163 {
164 _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
165 _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
166 }
167
168 if (_r >= _l)
169 {
170 r = _r;
171 l = _l;
172 assert(r + 1 - l >= 0);
173 return true;
174 }
175 return false;
176 }
177
178public:
183 // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
184 // std::array of iterators.
185 fm_index_cursor() noexcept = default;
186 fm_index_cursor(fm_index_cursor const &) noexcept = default;
187 fm_index_cursor & operator=(fm_index_cursor const &) noexcept = default;
188 fm_index_cursor(fm_index_cursor &&) noexcept = default;
189 fm_index_cursor & operator=(fm_index_cursor &&) noexcept = default;
190 ~fm_index_cursor() = default;
191
193 fm_index_cursor(index_t const & _index) noexcept :
194 index(&_index),
195 node({0, _index.index.size() - 1, 0, 0}),
196 sigma(_index.index.sigma - index_t::text_layout_mode)
197 {
198 assert(_index.index.size() != 0);
199 }
200 //\}
201
214 bool operator==(fm_index_cursor const & rhs) const noexcept
215 {
216 assert(index != nullptr);
217 assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
218
219 // position in the implicit suffix tree is defined by the SA interval and depth.
220 // No need to compare parent intervals
221 return node == rhs.node;
222 }
223
236 bool operator!=(fm_index_cursor const & rhs) const noexcept
237 {
238 assert(index != nullptr);
239
240 return !(*this == rhs);
241 }
242
260 bool extend_right() noexcept
261 {
262 // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
263 // store all cursors at once in a private std::array of cursors
264 assert(index != nullptr);
265
266 sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
267 size_type _lb = node.lb, _rb = node.rb;
268 while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
269 {
270 ++c;
271 }
272
273 if (c != sigma)
274 {
275 parent_lb = node.lb;
276 parent_rb = node.rb;
277 node = {_lb, _rb, node.depth + 1, c};
278 return true;
279 }
280 return false;
281 }
282
296 template <typename char_t>
297 requires std::convertible_to<char_t, index_alphabet_type> bool
298 extend_right(char_t const c) noexcept
299 {
300 assert(index != nullptr);
301 // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
302 // for the indexed text.
303 assert(seqan3::to_rank(static_cast<index_alphabet_type>(c))
304 < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
305
306 size_type _lb = node.lb, _rb = node.rb;
307
308 sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
309
310 if (backward_search(index->index, c_char, _lb, _rb))
311 {
312 parent_lb = node.lb;
313 parent_rb = node.rb;
314 node = {_lb, _rb, node.depth + 1, c_char};
315 return true;
316 }
317 return false;
318 }
319
321 template <typename char_type>
322 requires detail::is_char_adaptation_v<char_type> bool
323 extend_right(char_type const * cstring) noexcept
324 {
326 }
327
344 template <std::ranges::range seq_t>
345 bool extend_right(seq_t && seq) noexcept
346 {
347 static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
348 static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
349 "The alphabet of the sequence must be convertible to the alphabet of the index.");
350
351 assert(index != nullptr); // range must not be empty!
352
353 size_type _lb = node.lb, _rb = node.rb;
354 size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
355
356 sdsl_char_type c{};
357 size_t len{0};
358
359 for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
360 {
361 // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
362 // for the indexed text.
363 assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it))
364 < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
365
366 c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
367
368 new_parent_lb = _lb;
369 new_parent_rb = _rb;
370 if (!backward_search(index->index, c, _lb, _rb))
371 return false;
372 }
373
374 parent_lb = new_parent_lb;
375 parent_rb = new_parent_rb;
376 node = {_lb, _rb, len + node.depth, c};
377 return true;
378 }
379
406 bool cycle_back() noexcept
407 {
408 assert(index != nullptr && query_length() > 0);
409 // parent_lb > parent_rb --> invalid interval
410 assert(parent_lb <= parent_rb);
411
412 sdsl_char_type c = node.last_char + 1;
413 size_type _lb = parent_lb, _rb = parent_rb;
414
415 while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
416 {
417 ++c;
418 }
419
420 if (c != sigma) // Collection has additional sentinel as delimiter
421 {
422 node = {_lb, _rb, node.depth, c};
423 return true;
424 }
425 return false;
426 }
427
443 size_type last_rank() const noexcept
444 {
445 // parent_lb > parent_rb --> invalid interval
446 assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
447
448 return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
449 }
450
467 {
468 assert(index != nullptr);
469
470 return {node.lb, node.rb + 1};
471 }
472
491 size_type query_length() const noexcept
492 {
493 assert(index != nullptr);
494 assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
495
496 return node.depth;
497 }
498
516 template <std::ranges::range text_t>
517 auto path_label(text_t && text) const noexcept
518 requires (index_t::text_layout_mode == text_layout::single)
519 {
520 static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
521 static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
522 static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
523 "The alphabet types of the given text and index differ.");
524 assert(index != nullptr);
525
526 size_type const query_begin = offset() - index->index[node.lb];
527 return text | views::slice(query_begin, query_begin + query_length());
528 }
529
531 template <std::ranges::range text_t>
532 auto path_label(text_t && text) const noexcept
533 requires (index_t::text_layout_mode == text_layout::collection)
534 {
535 static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
536 static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
537 static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
538 "The alphabet types of the given text and index differ.");
539 assert(index != nullptr);
540
541 // Position of query in concatenated text.
542 size_type const location = offset() - index->index[node.lb];
543
544 // The rank represents the number of start positions of the individual sequences/texts in the collection
545 // before position `location + 1` and thereby also the number of delimiters.
546 size_type const rank = index->text_begin_rs.rank(location + 1);
547 assert(rank > 0);
548 size_type const text_id = rank - 1;
549
550 // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
551 size_type const start_location = index->text_begin_ss.select(rank);
552 // Substract lengths of previous sequences.
553 size_type const query_begin = location - start_location;
554
555 // Take subtext, slice query out of it
556 return text[text_id] | views::slice(query_begin, query_begin + query_length());
557 }
558
570 size_type count() const noexcept
571 {
572 assert(index != nullptr);
573
574 return 1 + node.rb - node.lb;
575 }
576
589 requires (index_t::text_layout_mode == text_layout::single)
590 {
591 assert(index != nullptr);
592
593 locate_result_type occ{};
594 occ.reserve(count());
595 for (size_type i = 0; i < count(); ++i)
596 {
597 occ.emplace_back(0, offset() - index->index[node.lb + i]);
598 }
599
600 return occ;
601 }
602
605 requires (index_t::text_layout_mode == text_layout::collection)
606 {
607 assert(index != nullptr);
608
610 occ.reserve(count());
611 for (size_type i = 0; i < count(); ++i)
612 {
613 size_type loc = offset() - index->index[node.lb + i];
614 size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
615 size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
616 occ.emplace_back(sequence_rank - 1, sequence_position);
617 }
618 return occ;
619 }
620
633 auto lazy_locate() const
634 requires (index_t::text_layout_mode == text_layout::single)
635 {
636 assert(index != nullptr);
637
638 return std::views::iota(node.lb, node.lb + count())
640 [*this, _offset = offset()](auto sa_pos)
641 {
642 return locate_result_value_type{0u, _offset - index->index[sa_pos]};
643 });
644 }
645
647 auto lazy_locate() const
648 requires (index_t::text_layout_mode == text_layout::collection)
649 {
650 assert(index != nullptr);
651
652 return std::views::iota(node.lb, node.lb + count())
654 [*this, _offset = offset()](auto sa_pos)
655 {
656 auto loc = _offset - index->index[sa_pos];
657 size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
658 size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
659 return locate_result_value_type{sequence_rank - 1, sequence_position};
660 });
661 }
662
670 template <cereal_archive archive_t>
671 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
672 {
673 archive(parent_lb);
674 archive(parent_rb);
675 archive(node);
676 archive(sigma);
677 }
679};
680
681} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
Provides alphabet adaptations for standard char types.
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:87
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_cursor.hpp:345
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition: fm_index_cursor.hpp:517
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_cursor.hpp:406
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:236
seqan3::suffix_array_interval suffix_array_interval() const noexcept
Returns the half-open suffix array interval.
Definition: fm_index_cursor.hpp:466
locate_result_type locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:588
auto lazy_locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:647
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition: fm_index_cursor.hpp:214
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_cursor.hpp:570
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition: fm_index_cursor.hpp:443
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_cursor.hpp:260
bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:323
locate_result_type locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:604
auto path_label(text_t &&text) const noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: fm_index_cursor.hpp:532
size_type query_length() const noexcept
Returns the length of the searched query. .
Definition: fm_index_cursor.hpp:491
index_t index_type
Type of the index.
Definition: fm_index_cursor.hpp:93
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_cursor.hpp:95
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition: fm_index_cursor.hpp:633
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_cursor.hpp:298
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T emplace_back(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
@ seq
The "sequence", usually a range of nucleotides or amino acids.
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:91
@ single
The text is a single range.
Definition: concept.hpp:93
@ collection
The text is a range of ranges.
Definition: concept.hpp:95
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:470
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T reserve(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides seqan3::views::slice.
The underlying suffix array interval.
Definition: fm_index_cursor.hpp:33
size_t end_position
The exclusive end position of the interval ("right boundary").
Definition: fm_index_cursor.hpp:37
size_t begin_position
The begin position of the interval ("left boundary").
Definition: fm_index_cursor.hpp:35
friend bool operator==(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for equality.
Definition: fm_index_cursor.hpp:47
friend bool operator!=(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for inequality.
Definition: fm_index_cursor.hpp:57