SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/algorithm>
16#include <seqan3/std/filesystem>
17#include <seqan3/std/ranges>
18
19#include <sdsl/suffix_trees.hpp>
20
27
28namespace seqan3::detail
29{
33struct fm_index_validator
34{
49 template <semialphabet alphabet_t, text_layout text_layout_mode_, std::ranges::range text_t>
50 static void validate(text_t && text)
51 {
52 if constexpr (text_layout_mode_ == text_layout::single)
53 {
54 static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
55 static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
56 "The alphabet of the text collection must be convertible to the alphabet of the index.");
57 static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
58
59 if (std::ranges::empty(text))
60 throw std::invalid_argument("The text to index cannot be empty.");
61 }
62 else
63 {
64 static_assert(std::ranges::bidirectional_range<text_t>,
65 "The text collection must model bidirectional_range.");
66 static_assert(std::ranges::bidirectional_range<std::ranges::range_reference_t<text_t>>,
67 "The elements of the text collection must model bidirectional_range.");
68 static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
69 "The alphabet of the text collection must be convertible to the alphabet of the index.");
70 static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
71
72 if (std::ranges::empty(text))
73 throw std::invalid_argument("The text collection to index cannot be empty.");
74 }
75 static_assert(alphabet_size<range_innermost_value_t<text_t>> <= 256, "The alphabet is too big.");
76 }
77};
78} // namespace seqan3::detail
79
80namespace seqan3
81{
82
84// forward declarations
85template <typename index_t>
86class fm_index_cursor;
87
88template <typename index_t>
89class bi_fm_index_cursor;
90
91namespace detail
92{
93template <semialphabet alphabet_t,
94 text_layout text_layout_mode_,
95 detail::sdsl_index sdsl_index_type_>
96class reverse_fm_index;
97}
99
133 sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector, // Wavelet tree type
134 sdsl::rank_support_v<>,
135 sdsl::select_support_scan<>,
136 sdsl::select_support_scan<0>>,
137 16, // Sampling rate of the suffix array
138 10'000'000, // Sampling rate of the inverse suffix array
139 sdsl::sa_order_sa_sampling<>, // How to sample positions in the suffix array (text VS SA sampling)
140 sdsl::isa_sampling<>, // How to sample positons in the inverse suffix array
141 sdsl::plain_byte_alphabet>; // How to represent the alphabet
142
149
188template <semialphabet alphabet_t,
189 text_layout text_layout_mode_,
190 detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
192{
193private:
198 using sdsl_index_type = sdsl_index_type_;
202 using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
204 using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
206
207 friend class detail::reverse_fm_index<alphabet_t, text_layout_mode_, sdsl_index_type_>;
208
210 sdsl_index_type index;
211
213 sdsl::sd_vector<> text_begin;
215 sdsl::select_support_sd<1> text_begin_ss;
217 sdsl::rank_support_sd<1> text_begin_rs;
218
238 template <std::ranges::range text_t>
240 requires (text_layout_mode_ == text_layout::single)
242 void construct(text_t && text)
243 {
244 detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
245
246 constexpr auto sigma = alphabet_size<alphabet_t>;
247
248 // TODO:
249 // * check what happens in sdsl when constructed twice!
250 // * choose between in-memory/external and construction algorithms
251 // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
252 // uint8_t largest_char = 0;
253 sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
254
255 std::ranges::move(text
257 | std::views::transform([] (uint8_t const r)
258 {
259 if constexpr (sigma == 256)
260 {
261 if (r == 255)
262 throw std::out_of_range("The input text cannot be indexed, because for full"
263 "character alphabets the last one/two values are reserved"
264 "(single sequence/collection).");
265 }
266 return r + 1;
267 })
268 | std::views::reverse,
269 std::ranges::begin(tmp_text)); // reverse and increase rank by one
270
271 sdsl::construct_im(index, tmp_text, 0);
272
273 // TODO: would be nice but doesn't work since it's private and the public member references are const
274 // index.m_C.resize(largest_char);
275 // index.m_C.shrink_to_fit();
276 // index.m_sigma = largest_char;
277 }
278
280 template <std::ranges::range text_t>
282 requires (text_layout_mode_ == text_layout::collection)
284 void construct(text_t && text, bool reverse = false)
285 {
286 detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
287
288 std::vector<size_t> text_sizes;
289
290 for (auto && t : text)
291 text_sizes.push_back(std::ranges::distance(t));
292
293 size_t const number_of_texts{text_sizes.size()};
294
295 // text size including delimiters
296 size_t const text_size = std::accumulate(text_sizes.begin(), text_sizes.end(), number_of_texts);
297
298 if (number_of_texts == text_size)
299 throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
300
301 constexpr auto sigma = alphabet_size<alphabet_t>;
302
303 // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
304 // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
305 // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
306 sdsl::sd_vector_builder builder(text_size, number_of_texts);
307 size_t prefix_sum{0};
308
309 for (auto && size : text_sizes)
310 {
311 builder.set(prefix_sum);
312 prefix_sum += size + 1;
313 }
314
315 text_begin = sdsl::sd_vector<>(builder);
316 text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
317 text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
318
319 // last text in collection needs no delimiter if we have more than one text in the collection
320 sdsl::int_vector<8> tmp_text(text_size - (number_of_texts > 1));
321
322 constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
323
324 std::ranges::move(text
325 | views::deep{views::to_rank}
326 | views::deep
327 {
328 std::views::transform([] (uint8_t const r)
329 {
330 if constexpr (sigma >= 255)
331 {
332 if (r >= 254)
333 throw std::out_of_range("The input text cannot be indexed, because"
334 " for full character alphabets the last one/"
335 "two values are reserved (single sequence/"
336 "collection).");
337 }
338 return r + 1;
339 })
340 }
341 | views::join_with(delimiter),
342 std::ranges::begin(tmp_text));
343
344 if (!reverse)
345 {
346 // we need at least one delimiter
347 if (number_of_texts == 1)
348 tmp_text.back() = delimiter;
349
350 std::ranges::reverse(tmp_text);
351 }
352 else
353 {
354 // If only one text is in the text collection, we still need one delimiter at the end to be able to
355 // conduct rank and select queries when locating hits in the index.
356 // Also, tmp_text looks like [text|0], but after reversing we need [txet|0] to be able to add the delimiter.
357 if (number_of_texts == 1)
358 {
359 std::ranges::reverse(tmp_text.begin(), tmp_text.end() - 1);
360 tmp_text.back() = delimiter;
361 }
362 else
363 {
364 std::ranges::reverse(tmp_text);
365 }
366 }
367
368 sdsl::construct_im(index, tmp_text, 0);
369 }
370
371public:
373 static constexpr text_layout text_layout_mode = text_layout_mode_;
374
379 using alphabet_type = alphabet_t;
381 using size_type = typename sdsl_index_type::size_type;
385
386 template <typename bi_fm_index_t>
387 friend class bi_fm_index_cursor;
388
389 template <typename fm_index_t>
390 friend class fm_index_cursor;
391
392 template <typename fm_index_t>
393 friend class detail::fm_index_cursor_node;
394
398 fm_index() = default;
399
401 fm_index(fm_index const & rhs) :
402 index{rhs.index}, text_begin{rhs.text_begin}, text_begin_ss{rhs.text_begin_ss}, text_begin_rs{rhs.text_begin_rs}
403 {
404 text_begin_ss.set_vector(&text_begin);
405 text_begin_rs.set_vector(&text_begin);
406 }
407
410 index{std::move(rhs.index)}, text_begin{std::move(rhs.text_begin)},text_begin_ss{std::move(rhs.text_begin_ss)},
411 text_begin_rs{std::move(rhs.text_begin_rs)}
412 {
413 text_begin_ss.set_vector(&text_begin);
414 text_begin_rs.set_vector(&text_begin);
415 }
416
419 {
420 index = std::move(rhs.index);
421 text_begin = std::move(rhs.text_begin);
422 text_begin_ss = std::move(rhs.text_begin_ss);
423 text_begin_rs = std::move(rhs.text_begin_rs);
424
425 text_begin_ss.set_vector(&text_begin);
426 text_begin_rs.set_vector(&text_begin);
427
428 return *this;
429 }
430
431 ~fm_index() = default;
432
441 template <std::ranges::bidirectional_range text_t>
442 explicit fm_index(text_t && text)
443 {
444 construct(std::forward<text_t>(text));
445 }
447
459 size_type size() const noexcept
460 {
461 return index.size();
462 }
463
475 bool empty() const noexcept
476 {
477 return size() == 0;
478 }
479
491 bool operator==(fm_index const & rhs) const noexcept
492 {
493 // (void) rhs;
494 return (index == rhs.index) && (text_begin == rhs.text_begin);
495 }
496
508 bool operator!=(fm_index const & rhs) const noexcept
509 {
510 return !(*this == rhs);
511 }
512
527 cursor_type cursor() const noexcept
528 {
529 return {*this};
530 }
531
539 template <cereal_archive archive_t>
540 void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
541 {
542 archive(index);
543 archive(text_begin);
544 archive(text_begin_ss);
545 text_begin_ss.set_vector(&text_begin);
546 archive(text_begin_rs);
547 text_begin_rs.set_vector(&text_begin);
548
549 auto sigma = alphabet_size<alphabet_t>;
550 archive(sigma);
551 if (sigma != alphabet_size<alphabet_t>)
552 {
553 throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma) +
554 " but it is being read into an fm_index with an alphabet of size " +
555 std::to_string(alphabet_size<alphabet_t>) + "."};
556 }
557
558 bool tmp = text_layout_mode;
559 archive(tmp);
560 if (tmp != text_layout_mode)
561 {
562 throw std::logic_error{std::string{"The fm_index was built over a "} +
563 (tmp ? "text collection" : "single text") +
564 " but it is being read into an fm_index expecting a " +
565 (text_layout_mode ? "text collection." : "single text.")};
566 }
567 }
569
570};
571
576template <std::ranges::range text_t>
577fm_index(text_t &&) -> fm_index<range_innermost_value_t<text_t>, text_layout{range_dimension_v<text_t> != 1}>;
579} // namespace seqan3
580
581namespace seqan3::detail
582{
583
597template <semialphabet alphabet_t,
598 text_layout text_layout_mode,
599 detail::sdsl_index sdsl_index_type = default_sdsl_index_type>
600class reverse_fm_index : public fm_index<alphabet_t, text_layout_mode, sdsl_index_type>
601{
602private:
604 template <std::ranges::range text_t>
605 void construct_(text_t && text)
606 {
607 if constexpr (text_layout_mode == text_layout::single)
608 {
609 auto reverse_text = text | std::views::reverse;
610 this->construct(reverse_text);
611 }
612 else
613 {
614 auto reverse_text = text | views::deep{std::views::reverse} | std::views::reverse;
615 this->construct(reverse_text, true);
616 }
617 }
618
619public:
621
623 template <std::ranges::bidirectional_range text_t>
624 explicit reverse_fm_index(text_t && text)
625 {
626 construct_(std::forward<text_t>(text));
627 }
628
629};
630
631} // namespace seqan3::detail
T accumulate(T... args)
The <algorithm> header from C++20's standard library.
T begin(T... args)
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:55
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:87
The SeqAn FM Index.
Definition: fm_index.hpp:192
fm_index & operator=(fm_index rhs)
When copy/move assigning, also update internal data structures.
Definition: fm_index.hpp:418
fm_index()=default
Defaulted.
static constexpr text_layout text_layout_mode
Indicates whether index is built over a collection.
Definition: fm_index.hpp:373
cursor_type cursor() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching. .
Definition: fm_index.hpp:527
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:459
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:508
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:475
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:491
~fm_index()=default
Defaulted.
alphabet_t alphabet_type
The type of the underlying character of the indexed text.
Definition: fm_index.hpp:379
fm_index(fm_index &&rhs)
When move constructing, also update internal data structures.
Definition: fm_index.hpp:409
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:381
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:442
fm_index(fm_index const &rhs)
When copy constructing, also update internal data structures.
Definition: fm_index.hpp:401
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T end(T... args)
The <filesystem> header from C++17's standard library.
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
auto const to_rank
A view that calls seqan3::to_rank() on each element in the input range.
Definition: to_rank.hpp:66
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:72
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:148
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10 '000 '000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:141
@ single
The text is a single range.
Definition: concept.hpp:74
@ collection
The text is a range of ranges.
Definition: concept.hpp:76
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr auto join_with
A join view, please use std::views::join if you don't need a separator.
Definition: join_with.hpp:29
The basis for seqan3::alphabet, but requires only rank interface (not char).
Provides seqan3::views::join_with.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
fm_index(text_t &&) -> fm_index< range_innermost_value_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition: fm_index.hpp:577
SeqAn specific customisations in the standard namespace.
T push_back(T... args)
The <ranges> header from C++20's standard library.
Provides the concept for seqan3::detail::sdsl_index.
T size(T... args)
Provides seqan3::views::to_rank.
T to_string(T... args)