SeqAn3 3.4.0-rc.4
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
fm_index_cursor.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <array>
13#include <ranges>
14#include <type_traits>
15
18#include <seqan3/contrib/sdsl-lite.hpp>
23
24namespace seqan3
25{
29{
33 size_t end_position{};
34
43 friend bool operator==(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
44 {
45 return lhs.begin_position == rhs.begin_position && lhs.end_position == rhs.end_position;
46 }
47
53 friend bool operator!=(suffix_array_interval const & lhs, suffix_array_interval const & rhs) noexcept
54 {
55 return !(lhs == rhs);
56 }
58};
59
81template <typename index_t>
83{
84public:
89 using index_type = index_t;
91 using size_type = typename index_type::size_type;
93
94private:
99 using node_type = detail::fm_index_cursor_node<index_t>;
101 using sdsl_char_type = typename index_type::sdsl_char_type;
103 using sdsl_index_type = typename index_t::sdsl_index_type;
105 using sdsl_sigma_type = typename index_type::sdsl_sigma_type;
107 using index_alphabet_type = typename index_t::alphabet_type;
113
115 index_type const * index{nullptr};
117 size_type parent_lb{};
119 size_type parent_rb{};
121 node_type node{};
123 sdsl_sigma_type sigma{};
124
125 template <typename _index_t>
126 friend class bi_fm_index_cursor;
127
129 size_type offset() const noexcept
130 {
131 assert(index->index.size() > query_length());
132 return index->index.size() - query_length() - 1; // since the string is reversed during construction
133 }
134
136 bool
137 backward_search(sdsl_index_type const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
138 {
139 assert(l <= r && r < csa.size());
140
141 size_type _l, _r;
142
143 size_type cc = c;
144 if constexpr (!std::same_as<index_alphabet_type, seqan3::contrib::sdsl::plain_byte_alphabet>)
145 {
146 cc = csa.char2comp[c];
147 if (cc == 0 && c > 0) // [[unlikely]]
148 return false;
149 }
150
151 size_type const c_begin = csa.C[cc];
152 if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
153 {
154 _l = c_begin;
155 _r = csa.C[cc + 1] - 1;
156 // if we use not the plain_byte_alphabet, we could return always return true here
157 }
158 else
159 {
160 _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
161 _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
162 }
163
164 if (_r >= _l)
165 {
166 r = _r;
167 l = _l;
168 assert(r + 1 - l >= 0);
169 return true;
170 }
171 return false;
172 }
173
174public:
179 // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
180 // std::array of iterators.
187
190 index(&_index),
191 node({0, _index.index.size() - 1, 0, 0}),
192 sigma(_index.index.sigma - index_t::text_layout_mode)
193 {
194 assert(_index.index.size() != 0);
195 }
196 //\}
197
210 bool operator==(fm_index_cursor const & rhs) const noexcept
211 {
212 assert(index != nullptr);
213 assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
214
215 // position in the implicit suffix tree is defined by the SA interval and depth.
216 // No need to compare parent intervals
217 return node == rhs.node;
218 }
219
232 bool operator!=(fm_index_cursor const & rhs) const noexcept
233 {
234 assert(index != nullptr);
235
236 return !(*this == rhs);
237 }
238
257 {
258 // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
259 // store all cursors at once in a private std::array of cursors
260 assert(index != nullptr);
261
262 sdsl_sigma_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
263 size_type _lb = node.lb, _rb = node.rb;
264 while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
265 {
266 ++c;
267 }
268
269 if (c != sigma)
270 {
271 parent_lb = node.lb;
272 parent_rb = node.rb;
274 node = {_lb, _rb, node.depth + 1, static_cast<sdsl_char_type>(c)};
275 return true;
276 }
277 return false;
278 }
279
293 template <typename char_t>
294 requires std::convertible_to<char_t, index_alphabet_type>
295 bool extend_right(char_t const c) noexcept
296 {
297 assert(index != nullptr);
298 // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
299 // for the indexed text.
300 assert(seqan3::to_rank(static_cast<index_alphabet_type>(c))
301 < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
302
303 size_type _lb = node.lb, _rb = node.rb;
304
305 sdsl_char_type c_char = seqan3::to_rank(static_cast<index_alphabet_type>(c)) + 1;
306
307 if (backward_search(index->index, c_char, _lb, _rb))
308 {
309 parent_lb = node.lb;
310 parent_rb = node.rb;
311 node = {_lb, _rb, node.depth + 1, c_char};
312 return true;
313 }
314 return false;
315 }
316
318 template <typename char_type>
319 requires detail::is_char_adaptation_v<char_type>
320 bool extend_right(char_type const * cstring) noexcept
321 {
323 }
324
341 template <std::ranges::range seq_t>
342 bool extend_right(seq_t && seq) noexcept
343 {
344 static_assert(std::ranges::forward_range<seq_t>, "The query must model forward_range.");
345 static_assert(std::convertible_to<range_innermost_value_t<seq_t>, index_alphabet_type>,
346 "The alphabet of the sequence must be convertible to the alphabet of the index.");
347
348 assert(index != nullptr); // range must not be empty!
349
350 size_type _lb = node.lb, _rb = node.rb;
351 size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
352
353 sdsl_char_type c{};
354 size_t len{0};
355
356 for (auto it = std::ranges::begin(seq); it != std::ranges::end(seq); ++len, ++it)
357 {
358 // The rank cannot exceed 255 for single text and 254 for text collections as they are reserved as sentinels
359 // for the indexed text.
360 assert(seqan3::to_rank(static_cast<index_alphabet_type>(*it))
361 < ((index_type::text_layout_mode == text_layout::single) ? 255 : 254));
362
363 c = seqan3::to_rank(static_cast<index_alphabet_type>(*it)) + 1;
364
367 if (!backward_search(index->index, c, _lb, _rb))
368 return false;
369 }
370
371 parent_lb = new_parent_lb;
372 parent_rb = new_parent_rb;
373 node = {_lb, _rb, len + node.depth, c};
374 return true;
375 }
376
404 {
405 assert(index != nullptr && query_length() > 0);
406 // parent_lb > parent_rb --> invalid interval
407 assert(parent_lb <= parent_rb);
408
409 sdsl_sigma_type c = node.last_char + 1;
410 size_type _lb = parent_lb, _rb = parent_rb;
411
412 while (c < sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
413 {
414 ++c;
415 }
416
417 if (c != sigma) // Collection has additional sentinel as delimiter
418 {
420 node = {_lb, _rb, node.depth, static_cast<sdsl_char_type>(c)};
421 return true;
422 }
423 return false;
424 }
425
442 {
443 // parent_lb > parent_rb --> invalid interval
444 assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
445
446 return index->index.comp2char[node.last_char] - 1; // text is not allowed to contain ranks of 0
447 }
448
465 {
466 assert(index != nullptr);
467
468 return {node.lb, node.rb + 1};
469 }
470
490 {
491 assert(index != nullptr);
492 assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
493
494 return node.depth;
495 }
496
514 template <std::ranges::range text_t>
515 auto path_label(text_t && text) const noexcept
516 requires (index_t::text_layout_mode == text_layout::single)
517 {
518 static_assert(std::ranges::input_range<text_t>, "The text must model input_range.");
519 static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
520 static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
521 "The alphabet types of the given text and index differ.");
522 assert(index != nullptr);
523
524 size_type const query_begin = offset() - index->index[node.lb];
526 }
527
529 template <std::ranges::range text_t>
530 auto path_label(text_t && text) const noexcept
531 requires (index_t::text_layout_mode == text_layout::collection)
532 {
533 static_assert(std::ranges::input_range<text_t>, "The text collection must model input_range.");
534 static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
535 static_assert(std::same_as<range_innermost_value_t<text_t>, index_alphabet_type>,
536 "The alphabet types of the given text and index differ.");
537 assert(index != nullptr);
538
539 // Position of query in concatenated text.
540 size_type const location = offset() - index->index[node.lb];
541
542 // The rank represents the number of start positions of the individual sequences/texts in the collection
543 // before position `location + 1` and thereby also the number of delimiters.
544 size_type const rank = index->text_begin_rs.rank(location + 1);
545 assert(rank > 0);
546 size_type const text_id = rank - 1;
547
548 // The start location of the `text_id`-th text in the sequence (position of the `rank`-th 1 in the bitvector).
549 size_type const start_location = index->text_begin_ss.select(rank);
550 // Substract lengths of previous sequences.
552
553 // Take subtext, slice query out of it
555 }
556
569 {
570 assert(index != nullptr);
571
572 return 1 + node.rb - node.lb;
573 }
574
587 requires (index_t::text_layout_mode == text_layout::single)
588 {
589 assert(index != nullptr);
590
592 occ.reserve(count());
593 for (size_type i = 0; i < count(); ++i)
594 {
595 occ.emplace_back(0, offset() - index->index[node.lb + i]);
596 }
597
598 return occ;
599 }
600
603 requires (index_t::text_layout_mode == text_layout::collection)
604 {
605 assert(index != nullptr);
606
608 occ.reserve(count());
609 for (size_type i = 0; i < count(); ++i)
610 {
611 size_type loc = offset() - index->index[node.lb + i];
612 size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
613 size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
614 occ.emplace_back(sequence_rank - 1, sequence_position);
615 }
616 return occ;
617 }
618
632 requires (index_t::text_layout_mode == text_layout::single)
633 {
634 assert(index != nullptr);
635
636 return std::views::iota(node.lb, node.lb + count())
637 | std::views::transform(
638 [*this, _offset = offset()](auto sa_pos)
639 {
640 return locate_result_value_type{0u, _offset - index->index[sa_pos]};
641 });
642 }
643
646 requires (index_t::text_layout_mode == text_layout::collection)
647 {
648 assert(index != nullptr);
649
650 return std::views::iota(node.lb, node.lb + count())
651 | std::views::transform(
652 [*this, _offset = offset()](auto sa_pos)
653 {
654 auto loc = _offset - index->index[sa_pos];
655 size_type sequence_rank = index->text_begin_rs.rank(loc + 1);
656 size_type sequence_position = loc - index->text_begin_ss.select(sequence_rank);
657 return locate_result_value_type{sequence_rank - 1, sequence_position};
658 });
659 }
660
668 template <cereal_archive archive_t>
670 {
671 archive(parent_lb);
672 archive(parent_rb);
673 archive(node);
674 archive(sigma);
675 }
677};
678
679} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
Provides alphabet adaptations for standard char types.
A "pretty printer" for most SeqAn data structures and related types.
Definition debug_stream_type.hpp:79
The SeqAn FM Index Cursor.
Definition fm_index_cursor.hpp:83
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition fm_index_cursor.hpp:342
auto path_label(text_t &&text) const noexcept
Returns the searched query.
Definition fm_index_cursor.hpp:515
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition fm_index_cursor.hpp:403
bool operator!=(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition fm_index_cursor.hpp:232
seqan3::suffix_array_interval suffix_array_interval() const noexcept
Returns the half-open suffix array interval.
Definition fm_index_cursor.hpp:464
locate_result_type locate() const
Locates the occurrences of the searched query in the text.
Definition fm_index_cursor.hpp:586
auto lazy_locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:645
bool operator==(fm_index_cursor const &rhs) const noexcept
Compares two cursors.
Definition fm_index_cursor.hpp:210
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition fm_index_cursor.hpp:568
size_type last_rank() const noexcept
Outputs the rightmost rank.
Definition fm_index_cursor.hpp:441
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition fm_index_cursor.hpp:256
bool extend_right(char_type const *cstring) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:320
locate_result_type locate() const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:602
auto path_label(text_t &&text) const noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition fm_index_cursor.hpp:530
size_type query_length() const noexcept
Returns the length of the searched query. .
Definition fm_index_cursor.hpp:489
index_t index_type
Type of the index.
Definition fm_index_cursor.hpp:89
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition fm_index_cursor.hpp:91
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a std::ranges::view is retu...
Definition fm_index_cursor.hpp:631
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition fm_index_cursor.hpp:295
fm_index_cursor() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition alphabet/concept.hpp:152
@ seq
The "sequence", usually a range of nucleotides or amino acids.
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition search/fm_index/concept.hpp:69
@ single
The text is a single range.
Definition search/fm_index/concept.hpp:71
@ collection
The text is a range of ranges.
Definition search/fm_index/concept.hpp:73
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition slice.hpp:137
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
Provides the concept for seqan3::detail::sdsl_index.
Provides seqan3::views::slice.
The underlying suffix array interval.
Definition fm_index_cursor.hpp:29
size_t end_position
The exclusive end position of the interval ("right boundary").
Definition fm_index_cursor.hpp:33
size_t begin_position
The begin position of the interval ("left boundary").
Definition fm_index_cursor.hpp:31
friend bool operator==(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for equality.
Definition fm_index_cursor.hpp:43
friend bool operator!=(suffix_array_interval const &lhs, suffix_array_interval const &rhs) noexcept
Test for inequality.
Definition fm_index_cursor.hpp:53
Hide me