SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <seqan3/std/filesystem>
17 #include <seqan3/std/ranges>
18 
19 #include <sdsl/suffix_trees.hpp>
20 
27 
28 namespace seqan3::detail
29 {
33 struct fm_index_validator
34 {
49  template <semialphabet alphabet_t, text_layout text_layout_mode_, std::ranges::range text_t>
50  static void validate(text_t && text)
51  {
52  if constexpr (text_layout_mode_ == text_layout::single)
53  {
54  static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
55  static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
56  "The alphabet of the text collection must be convertible to the alphabet of the index.");
57  static_assert(range_dimension_v<text_t> == 1, "The input cannot be a text collection.");
58 
59  if (std::ranges::empty(text))
60  throw std::invalid_argument("The text to index cannot be empty.");
61  }
62  else
63  {
64  static_assert(std::ranges::bidirectional_range<text_t>,
65  "The text collection must model bidirectional_range.");
66  static_assert(std::ranges::bidirectional_range<std::ranges::range_reference_t<text_t>>,
67  "The elements of the text collection must model bidirectional_range.");
68  static_assert(std::convertible_to<range_innermost_value_t<text_t>, alphabet_t>,
69  "The alphabet of the text collection must be convertible to the alphabet of the index.");
70  static_assert(range_dimension_v<text_t> == 2, "The input must be a text collection.");
71 
72  if (std::ranges::empty(text))
73  throw std::invalid_argument("The text collection to index cannot be empty.");
74  }
75  static_assert(alphabet_size<range_innermost_value_t<text_t>> <= 256, "The alphabet is too big.");
76  }
77 };
78 } // namespace seqan3::detail
79 
80 namespace seqan3
81 {
82 
84 // forward declarations
85 template <typename index_t>
86 class fm_index_cursor;
87 
88 template <typename index_t>
89 class bi_fm_index_cursor;
90 
91 namespace detail
92 {
93 template <semialphabet alphabet_t,
94  text_layout text_layout_mode_,
95  detail::sdsl_index sdsl_index_type_>
96 class reverse_fm_index;
97 }
99 
136  sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector, // Wavelet tree type
137  sdsl::rank_support_v<>,
138  sdsl::select_support_scan<>,
139  sdsl::select_support_scan<0>>,
140  16, // Sampling rate of the suffix array
141  10'000'000, // Sampling rate of the inverse suffix array
142  sdsl::sa_order_sa_sampling<>, // How to sample positions in the suffix array (text VS SA sampling)
143  sdsl::isa_sampling<>, // How to sample positons in the inverse suffix array
144  sdsl::plain_byte_alphabet>; // How to represent the alphabet
145 
151 
189 template <semialphabet alphabet_t,
190  text_layout text_layout_mode_,
191  detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
192 class fm_index
193 {
194 private:
199  using sdsl_index_type = sdsl_index_type_;
203  using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
205  using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
207 
208  friend class detail::reverse_fm_index<alphabet_t, text_layout_mode_, sdsl_index_type_>;
209 
211  sdsl_index_type index;
212 
214  sdsl::sd_vector<> text_begin;
216  sdsl::select_support_sd<1> text_begin_ss;
218  sdsl::rank_support_sd<1> text_begin_rs;
219 
239  template <std::ranges::range text_t>
241  requires (text_layout_mode_ == text_layout::single)
243  void construct(text_t && text)
244  {
245  detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
246 
247  constexpr auto sigma = alphabet_size<alphabet_t>;
248 
249  // TODO:
250  // * check what happens in sdsl when constructed twice!
251  // * choose between in-memory/external and construction algorithms
252  // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
253  // uint8_t largest_char = 0;
254  sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
255 
256  std::ranges::move(text
258  | std::views::transform([] (uint8_t const r)
259  {
260  if constexpr (sigma == 256)
261  {
262  if (r == 255)
263  throw std::out_of_range("The input text cannot be indexed, because for full"
264  "character alphabets the last one/two values are reserved"
265  "(single sequence/collection).");
266  }
267  return r + 1;
268  })
269  | std::views::reverse,
270  std::ranges::begin(tmp_text)); // reverse and increase rank by one
271 
272  sdsl::construct_im(index, tmp_text, 0);
273 
274  // TODO: would be nice but doesn't work since it's private and the public member references are const
275  // index.m_C.resize(largest_char);
276  // index.m_C.shrink_to_fit();
277  // index.m_sigma = largest_char;
278  }
279 
281  template <std::ranges::range text_t>
283  requires (text_layout_mode_ == text_layout::collection)
285  void construct(text_t && text, bool reverse = false)
286  {
287  detail::fm_index_validator::validate<alphabet_t, text_layout_mode_>(text);
288 
289  std::vector<size_t> text_sizes;
290 
291  for (auto && t : text)
292  text_sizes.push_back(std::ranges::distance(t));
293 
294  size_t const number_of_texts{text_sizes.size()};
295 
296  // text size including delimiters
297  size_t const text_size = std::accumulate(text_sizes.begin(), text_sizes.end(), 0) + number_of_texts;
298 
299  if (number_of_texts == text_size)
300  throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
301 
302  constexpr auto sigma = alphabet_size<alphabet_t>;
303 
304  // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
305  // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
306  // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
307  sdsl::sd_vector_builder builder(text_size, number_of_texts);
308  size_t prefix_sum{0};
309 
310  for (auto && size : text_sizes)
311  {
312  builder.set(prefix_sum);
313  prefix_sum += size + 1;
314  }
315 
316  text_begin = sdsl::sd_vector<>(builder);
317  text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
318  text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
319 
320  // last text in collection needs no delimiter if we have more than one text in the collection
321  sdsl::int_vector<8> tmp_text(text_size - (number_of_texts > 1));
322 
323  constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
324 
325  std::ranges::move(text
326  | views::deep{views::to_rank}
327  | views::deep
328  {
329  std::views::transform([] (uint8_t const r)
330  {
331  if constexpr (sigma >= 255)
332  {
333  if (r >= 254)
334  throw std::out_of_range("The input text cannot be indexed, because"
335  " for full character alphabets the last one/"
336  "two values are reserved (single sequence/"
337  "collection).");
338  }
339  return r + 1;
340  })
341  }
342  | views::join_with(delimiter),
343  std::ranges::begin(tmp_text));
344 
345  if (!reverse)
346  {
347  // we need at least one delimiter
348  if (number_of_texts == 1)
349  tmp_text.back() = delimiter;
350 
351  std::ranges::reverse(tmp_text);
352  }
353  else
354  {
355  // If only one text is in the text collection, we still need one delimiter at the end to be able to
356  // conduct rank and select queries when locating hits in the index.
357  // Also, tmp_text looks like [text|0], but after reversing we need [txet|0] to be able to add the delimiter.
358  if (number_of_texts == 1)
359  {
360  std::ranges::reverse(tmp_text.begin(), tmp_text.end() - 1);
361  tmp_text.back() = delimiter;
362  }
363  else
364  {
365  std::ranges::reverse(tmp_text);
366  }
367  }
368 
369  sdsl::construct_im(index, tmp_text, 0);
370  }
371 
372 public:
374  static constexpr text_layout text_layout_mode = text_layout_mode_;
375 
380  using alphabet_type = alphabet_t;
382  using size_type = typename sdsl_index_type::size_type;
386 
387  template <typename bi_fm_index_t>
388  friend class bi_fm_index_cursor;
389 
390  template <typename fm_index_t>
391  friend class fm_index_cursor;
392 
393  template <typename fm_index_t>
394  friend class detail::fm_index_cursor_node;
395 
399  fm_index() = default;
400 
402  fm_index(fm_index const & rhs) :
403  index{rhs.index}, text_begin{rhs.text_begin}, text_begin_ss{rhs.text_begin_ss}, text_begin_rs{rhs.text_begin_rs}
404  {
405  text_begin_ss.set_vector(&text_begin);
406  text_begin_rs.set_vector(&text_begin);
407  }
408 
410  fm_index(fm_index && rhs) :
411  index{std::move(rhs.index)}, text_begin{std::move(rhs.text_begin)},text_begin_ss{std::move(rhs.text_begin_ss)},
412  text_begin_rs{std::move(rhs.text_begin_rs)}
413  {
414  text_begin_ss.set_vector(&text_begin);
415  text_begin_rs.set_vector(&text_begin);
416  }
417 
420  {
421  index = std::move(rhs.index);
422  text_begin = std::move(rhs.text_begin);
423  text_begin_ss = std::move(rhs.text_begin_ss);
424  text_begin_rs = std::move(rhs.text_begin_rs);
425 
426  text_begin_ss.set_vector(&text_begin);
427  text_begin_rs.set_vector(&text_begin);
428 
429  return *this;
430  }
431 
432  ~fm_index() = default;
433 
442  template <std::ranges::bidirectional_range text_t>
443  explicit fm_index(text_t && text)
444  {
445  construct(std::forward<text_t>(text));
446  }
448 
460  size_type size() const noexcept
461  {
462  return index.size();
463  }
464 
476  bool empty() const noexcept
477  {
478  return size() == 0;
479  }
480 
492  bool operator==(fm_index const & rhs) const noexcept
493  {
494  // (void) rhs;
495  return (index == rhs.index) && (text_begin == rhs.text_begin);
496  }
497 
509  bool operator!=(fm_index const & rhs) const noexcept
510  {
511  return !(*this == rhs);
512  }
513 
528  cursor_type cursor() const noexcept
529  {
530  return {*this};
531  }
532 
540  template <cereal_archive archive_t>
541  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
542  {
543  archive(index);
544  archive(text_begin);
545  archive(text_begin_ss);
546  text_begin_ss.set_vector(&text_begin);
547  archive(text_begin_rs);
548  text_begin_rs.set_vector(&text_begin);
549 
550  auto sigma = alphabet_size<alphabet_t>;
551  archive(sigma);
552  if (sigma != alphabet_size<alphabet_t>)
553  {
554  throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma) +
555  " but it is being read into an fm_index with an alphabet of size " +
556  std::to_string(alphabet_size<alphabet_t>) + "."};
557  }
558 
559  bool tmp = text_layout_mode;
560  archive(tmp);
561  if (tmp != text_layout_mode)
562  {
563  throw std::logic_error{std::string{"The fm_index was built over a "} +
564  (tmp ? "text collection" : "single text") +
565  " but it is being read into an fm_index expecting a " +
566  (text_layout_mode ? "text collection." : "single text.")};
567  }
568  }
570 
571 };
572 
577 template <std::ranges::range text_t>
578 fm_index(text_t &&) -> fm_index<range_innermost_value_t<text_t>, text_layout{range_dimension_v<text_t> != 1}>;
580 
582 } // namespace seqan3
583 
584 namespace seqan3::detail
585 {
586 
599 template <semialphabet alphabet_t,
600  text_layout text_layout_mode,
601  detail::sdsl_index sdsl_index_type = default_sdsl_index_type>
602 class reverse_fm_index : public fm_index<alphabet_t, text_layout_mode, sdsl_index_type>
603 {
604 private:
606  template <std::ranges::range text_t>
607  void construct_(text_t && text)
608  {
609  if constexpr (text_layout_mode == text_layout::single)
610  {
611  auto reverse_text = text | std::views::reverse;
612  this->construct(reverse_text);
613  }
614  else
615  {
616  auto reverse_text = text | views::deep{std::views::reverse} | std::views::reverse;
617  this->construct(reverse_text, true);
618  }
619  }
620 
621 public:
623 
625  template <std::ranges::bidirectional_range text_t>
626  explicit reverse_fm_index(text_t && text)
627  {
628  construct_(std::forward<text_t>(text));
629  }
630 
631 };
632 
633 } // namespace seqan3::detail
T accumulate(T... args)
Adaptations of algorithms from the Ranges TS.
Provides seqan3::views::to_rank.
T begin(T... args)
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:58
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:90
The SeqAn FM Index.
Definition: fm_index.hpp:193
fm_index()=default
Defaulted.
static constexpr text_layout text_layout_mode
Indicates whether index is built over a collection.
Definition: fm_index.hpp:374
cursor_type cursor() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching.
Definition: fm_index.hpp:528
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:460
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:509
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:476
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:492
fm_index & operator=(fm_index rhs)
When copy/move assigning, also update internal data structures.
Definition: fm_index.hpp:419
~fm_index()=default
Defaulted.
alphabet_t alphabet_type
The type of the underlying character of the indexed text.
Definition: fm_index.hpp:380
fm_index(fm_index &&rhs)
When move constructing, also update internal data structures.
Definition: fm_index.hpp:410
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:382
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:443
fm_index(fm_index const &rhs)
When copy constructing, also update internal data structures.
Definition: fm_index.hpp:402
Provides various transformation traits used by the range module.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
T end(T... args)
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independ...
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
constexpr auto alphabet_size
A type trait that holds the size of a (semi-)alphabet.
Definition: concept.hpp:858
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:81
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:150
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10 '000 '000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:144
fm_index(text_t &&) -> fm_index< range_innermost_value_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition: fm_index.hpp:578
@ single
The text is a single range.
Definition: concept.hpp:83
@ collection
The text is a range of ranges.
Definition: concept.hpp:85
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr auto join_with
A join view, please use std::views::join if you don't need a separator.
Definition: join_with.hpp:37
auto const to_rank
A view that calls seqan3::to_rank() on each element in the input range.
Definition: to_rank.hpp:71
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
The basis for seqan3::alphabet, but requires only rank interface (not char).
Provides seqan3::views::join_with.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
T push_back(T... args)
Adaptations of concepts from the Ranges TS.
Provides the concept for seqan3::detail::sdsl_index.
T size(T... args)
T to_string(T... args)