SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
fm_index.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <algorithm>
13#include <filesystem>
14#include <ranges>
15
16#include <sdsl/suffix_trees.hpp>
17
23
24namespace seqan3::detail
25{
29struct fm_index_validator
30{
45 template <semialphabet alphabet_t, text_layout text_layout_mode_, std::ranges::range text_t>
46 static void validate(text_t && text)
47 {
48 if constexpr (text_layout_mode_ == text_layout::single)
49 {
50 static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
51 static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
52 "The alphabet of the text collection must be convertible to the alphabet of the index.");
53 static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
54
55 if (std::ranges::empty(text))
56 throw std::invalid_argument("The text to index cannot be empty.");
57 }
58 else
59 {
60 static_assert(std::ranges::bidirectional_range<text_t>,
61 "The text collection must model bidirectional_range.");
62 static_assert(std::ranges::bidirectional_range<std::ranges::range_reference_t<text_t>>,
63 "The elements of the text collection must model bidirectional_range.");
64 static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
65 "The alphabet of the text collection must be convertible to the alphabet of the index.");
66 static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
67
68 if (std::ranges::empty(text))
69 throw std::invalid_argument("The text collection to index cannot be empty.");
70 }
71 static_assert(alphabet_size<range_innermost_value_t<text_t>> <= 256, "The alphabet is too big.");
72 }
73};
74} // namespace seqan3::detail
75
76namespace seqan3
77{
78
80// forward declarations
81template <typename index_t>
82class fm_index_cursor;
83
84template <typename index_t>
85class bi_fm_index_cursor;
86
87namespace detail
88{
89template <semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_>
90class reverse_fm_index;
91}
93
127 sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector, // Wavelet tree type
128 sdsl::rank_support_v<>,
129 sdsl::select_support_scan<>,
130 sdsl::select_support_scan<0>>,
131 16, // Sampling rate of the suffix array
132 10'000'000, // Sampling rate of the inverse suffix array
133 sdsl::sa_order_sa_sampling<>, // How to sample positions in the suffix array (text VS SA sampling)
134 sdsl::isa_sampling<>, // How to sample positons in the inverse suffix array
135 sdsl::plain_byte_alphabet>; // How to represent the alphabet
136
143
182template <semialphabet alphabet_t,
183 text_layout text_layout_mode_,
184 detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
186{
187private:
192 using sdsl_index_type = sdsl_index_type_;
196 using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
198 using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
200
201 friend class detail::reverse_fm_index<alphabet_t, text_layout_mode_, sdsl_index_type_>;
202
204 sdsl_index_type index;
205
207 sdsl::sd_vector<> text_begin;
209 sdsl::select_support_sd<1> text_begin_ss;
211 sdsl::rank_support_sd<1> text_begin_rs;
212
214 template <typename output_it_t, typename sequence_t>
215 static output_it_t copy_sequence_ranks_shifted_by_one(output_it_t output_it, sequence_t && sequence)
216 {
217 constexpr size_t sigma = alphabet_size<alphabet_t>;
218 constexpr size_t max_sigma = text_layout_mode_ == text_layout::single ? 256u : 255u;
219
220 constexpr auto warn_if_rank_out_of_range = [](uint8_t const rank)
221 {
222 if (rank >= max_sigma - 1) // same as rank + 1 >= max_sigma but without overflow
223 throw std::out_of_range("The input text cannot be indexed, because for full"
224 "character alphabets the last one/two values are reserved"
225 "(single sequence/collection).");
226 };
227
229 output_it,
230 [&warn_if_rank_out_of_range](auto const & chr)
231 {
232 uint8_t const rank = seqan3::to_rank(chr);
233 if constexpr (sigma >= max_sigma)
234 warn_if_rank_out_of_range(rank);
235 else
236 (void)warn_if_rank_out_of_range;
237 return rank + 1;
238 })
239 .out;
240 }
241
261 template <std::ranges::range text_t>
262 requires (text_layout_mode_ == text_layout::single)
263 void construct(text_t && text)
264 {
265 detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
266
267 // TODO:
268 // * check what happens in sdsl when constructed twice!
269 // * choose between in-memory/external and construction algorithms
270 // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
271 // uint8_t largest_char = 0;
272 sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
273
274 // copy ranks into tmp_text
275 copy_sequence_ranks_shifted_by_one(std::ranges::begin(tmp_text), text | std::views::reverse);
276
277 sdsl::construct_im(index, tmp_text, 0);
278
279 // TODO: would be nice but doesn't work since it's private and the public member references are const
280 // index.m_C.resize(largest_char);
281 // index.m_C.shrink_to_fit();
282 // index.m_sigma = largest_char;
283 }
284
286 template <std::ranges::range text_t>
287 requires (text_layout_mode_ == text_layout::collection)
288 void construct(text_t && text, bool reverse = false)
289 {
290 detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
291
292 std::vector<size_t> text_sizes;
293
294 for (auto && t : text)
295 text_sizes.push_back(std::ranges::distance(t));
296
297 size_t const number_of_texts{text_sizes.size()};
298
299 // text size including delimiters
300 size_t const text_size = std::accumulate(text_sizes.begin(), text_sizes.end(), number_of_texts);
301
302 if (number_of_texts == text_size)
303 throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
304
305 constexpr auto sigma = alphabet_size<alphabet_t>;
306
307 // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
308 // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
309 // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
310 sdsl::sd_vector_builder builder(text_size, number_of_texts);
311 size_t prefix_sum{0};
312
313 for (auto && size : text_sizes)
314 {
315 builder.set(prefix_sum);
316 prefix_sum += size + 1;
317 }
318
319 text_begin = sdsl::sd_vector<>(builder);
320 text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
321 text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
322
323 // last text in collection needs no delimiter if we have more than one text in the collection
324 sdsl::int_vector<8> tmp_text(text_size - (number_of_texts > 1));
325
326 constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
327
328 auto copy_join_with = [](auto output_it, auto && collection)
329 {
330 // this is basically std::views::join() with a delimiter
331 auto collection_it = std::ranges::begin(collection);
332 auto const collection_sentinel = std::ranges::end(collection);
333 if (collection_it == collection_sentinel)
334 return;
335
336 output_it = copy_sequence_ranks_shifted_by_one(output_it, *collection_it);
337 ++collection_it;
338
339 for (; collection_it != collection_sentinel; ++collection_it)
340 {
341 *output_it = delimiter;
342 ++output_it;
343 output_it = copy_sequence_ranks_shifted_by_one(output_it, *collection_it);
344 }
345 };
346
347 // copy ranks into tmp_text
348 copy_join_with(std::ranges::begin(tmp_text), text);
349
350 if (!reverse)
351 {
352 // we need at least one delimiter
353 if (number_of_texts == 1)
354 tmp_text.back() = delimiter;
355
356 std::ranges::reverse(tmp_text);
357 }
358 else
359 {
360 // If only one text is in the text collection, we still need one delimiter at the end to be able to
361 // conduct rank and select queries when locating hits in the index.
362 // Also, tmp_text looks like [text|0], but after reversing we need [txet|0] to be able to add the delimiter.
363 if (number_of_texts == 1)
364 {
365 std::ranges::reverse(tmp_text.begin(), tmp_text.end() - 1);
366 tmp_text.back() = delimiter;
367 }
368 else
369 {
370 std::ranges::reverse(tmp_text);
371 }
372 }
373
374 sdsl::construct_im(index, tmp_text, 0);
375 }
376
377public:
379 static constexpr text_layout text_layout_mode = text_layout_mode_;
380
385 using alphabet_type = alphabet_t;
387 using size_type = typename sdsl_index_type::size_type;
391
392 template <typename bi_fm_index_t>
393 friend class bi_fm_index_cursor;
394
395 template <typename fm_index_t>
396 friend class fm_index_cursor;
397
398 template <typename fm_index_t>
399 friend struct detail::fm_index_cursor_node;
400
404 fm_index() = default;
405
407 fm_index(fm_index const & rhs) :
408 index{rhs.index},
409 text_begin{rhs.text_begin},
410 text_begin_ss{rhs.text_begin_ss},
411 text_begin_rs{rhs.text_begin_rs}
412 {
413 text_begin_ss.set_vector(&text_begin);
414 text_begin_rs.set_vector(&text_begin);
415 }
416
419 index{std::move(rhs.index)},
420 text_begin{std::move(rhs.text_begin)},
421 text_begin_ss{std::move(rhs.text_begin_ss)},
422 text_begin_rs{std::move(rhs.text_begin_rs)}
423 {
424 text_begin_ss.set_vector(&text_begin);
425 text_begin_rs.set_vector(&text_begin);
426 }
427
430 {
431 index = std::move(rhs.index);
432 text_begin = std::move(rhs.text_begin);
433 text_begin_ss = std::move(rhs.text_begin_ss);
434 text_begin_rs = std::move(rhs.text_begin_rs);
435
436 text_begin_ss.set_vector(&text_begin);
437 text_begin_rs.set_vector(&text_begin);
438
439 return *this;
440 }
441
442 ~fm_index() = default;
443
452 template <std::ranges::bidirectional_range text_t>
453 explicit fm_index(text_t && text)
454 {
455 construct(std::forward<text_t>(text));
456 }
458
470 size_type size() const noexcept
471 {
472 return index.size();
473 }
474
486 bool empty() const noexcept
487 {
488 return size() == 0;
489 }
490
502 bool operator==(fm_index const & rhs) const noexcept
503 {
504 // (void) rhs;
505 return (index == rhs.index) && (text_begin == rhs.text_begin);
506 }
507
519 bool operator!=(fm_index const & rhs) const noexcept
520 {
521 return !(*this == rhs);
522 }
523
538 cursor_type cursor() const noexcept
539 {
540 return {*this};
541 }
542
550 template <cereal_archive archive_t>
551 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
552 {
553 archive(index);
554 archive(text_begin);
555 archive(text_begin_ss);
556 text_begin_ss.set_vector(&text_begin);
557 archive(text_begin_rs);
558 text_begin_rs.set_vector(&text_begin);
559
560 auto sigma = alphabet_size<alphabet_t>;
561 archive(sigma);
562 if (sigma != alphabet_size<alphabet_t>)
563 {
564 throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma)
565 + " but it is being read into an fm_index with an alphabet of size "
566 + std::to_string(alphabet_size<alphabet_t>) + "."};
567 }
568
569 bool tmp = text_layout_mode;
570 archive(tmp);
571 if (tmp != text_layout_mode)
572 {
573 throw std::logic_error{std::string{"The fm_index was built over a "}
574 + (tmp ? "text collection" : "single text")
575 + " but it is being read into an fm_index expecting a "
576 + (text_layout_mode ? "text collection." : "single text.")};
577 }
578 }
580};
581
586template <std::ranges::range text_t>
587fm_index(text_t &&) -> fm_index<range_innermost_value_t<text_t>, text_layout{range_dimension_v<text_t> != 1}>;
589} // namespace seqan3
590
591namespace seqan3::detail
592{
593
607template <semialphabet alphabet_t,
608 text_layout text_layout_mode,
609 detail::sdsl_index sdsl_index_type = default_sdsl_index_type>
610class reverse_fm_index : public fm_index<alphabet_t, text_layout_mode, sdsl_index_type>
611{
612private:
614 template <std::ranges::range text_t>
615 void construct_(text_t && text)
616 {
617 if constexpr (text_layout_mode == text_layout::single)
618 {
619 auto reverse_text = text | std::views::reverse;
620 this->construct(reverse_text);
621 }
622 else
623 {
624 auto reverse_text = text | views::deep{std::views::reverse} | std::views::reverse;
625 this->construct(reverse_text, true);
626 }
627 }
628
629public:
630 using fm_index<alphabet_t, text_layout_mode, sdsl_index_type>::fm_index;
631
633 template <std::ranges::bidirectional_range text_t>
634 explicit reverse_fm_index(text_t && text)
635 {
636 construct_(std::forward<text_t>(text));
637 }
638};
639
640} // namespace seqan3::detail
T accumulate(T... args)
T begin(T... args)
The SeqAn Bidirectional FM Index Cursor.
Definition bi_fm_index_cursor.hpp:52
The SeqAn FM Index Cursor.
Definition fm_index_cursor.hpp:84
The SeqAn FM Index.
Definition fm_index.hpp:186
fm_index & operator=(fm_index rhs)
When copy/move assigning, also update internal data structures.
Definition fm_index.hpp:429
fm_index()=default
Defaulted.
static constexpr text_layout text_layout_mode
Indicates whether index is built over a collection.
Definition fm_index.hpp:379
cursor_type cursor() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching. .
Definition fm_index.hpp:538
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition fm_index.hpp:470
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition fm_index.hpp:519
bool empty() const noexcept
Checks whether the index is empty.
Definition fm_index.hpp:486
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition fm_index.hpp:502
~fm_index()=default
Defaulted.
alphabet_t alphabet_type
The type of the underlying character of the indexed text.
Definition fm_index.hpp:385
fm_index(fm_index &&rhs)
When move constructing, also update internal data structures.
Definition fm_index.hpp:418
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition fm_index.hpp:387
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition fm_index.hpp:453
fm_index(fm_index const &rhs)
When copy constructing, also update internal data structures.
Definition fm_index.hpp:407
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T end(T... args)
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition alphabet/concept.hpp:152
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition search/fm_index/concept.hpp:70
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition fm_index.hpp:142
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10 '000 '000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition fm_index.hpp:135
@ single
The text is a single range.
Definition search/fm_index/concept.hpp:72
@ collection
The text is a range of ranges.
Definition search/fm_index/concept.hpp:74
The basis for seqan3::alphabet, but requires only rank interface (not char).
The generic concept for a (biological) sequence.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
fm_index(text_t &&) -> fm_index< range_innermost_value_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition fm_index.hpp:587
SeqAn specific customisations in the standard namespace.
T push_back(T... args)
T reverse(T... args)
Provides the concept for seqan3::detail::sdsl_index.
T size(T... args)
Provides seqan3::views::to_rank.
T to_string(T... args)
T transform(T... args)
Hide me