SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <sdsl/suffix_trees.hpp>
16 
18 #include <seqan3/std/filesystem>
27 #include <seqan3/std/algorithm>
28 #include <seqan3/std/ranges>
29 
30 namespace seqan3
31 {
32 
34 // forward declarations
35 template <typename index_t>
36 class fm_index_cursor;
37 
38 template <typename index_t>
39 class bi_fm_index_cursor;
41 
77 using sdsl_wt_index_type =
78  sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector,
79  sdsl::rank_support_v<>,
80  sdsl::select_support_scan<>,
81  sdsl::select_support_scan<0>>,
82  16,
83  10000000,
84  sdsl::sa_order_sa_sampling<>,
85  sdsl::isa_sampling<>,
87 
93 
131 template <semialphabet alphabet_t,
132  text_layout text_layout_mode_,
133  detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
134 class fm_index
135 {
136 private:
140  using sdsl_index_type = sdsl_index_type_;
145  using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
147  using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
149 
151  sdsl_index_type index;
152 
154  sdsl::sd_vector<> text_begin;
156  sdsl::select_support_sd<1> text_begin_ss;
158  sdsl::rank_support_sd<1> text_begin_rs;
159 
179  template <std::ranges::range text_t>
181  requires text_layout_mode_ == text_layout::single
183  void construct(text_t && text)
184  {
185  static_assert(std::ranges::bidirectional_range<text_t>, "The text must model bidirectional_range.");
186  static_assert(alphabet_size<innermost_value_type_t<text_t>> <= 256, "The alphabet is too big.");
187  static_assert(std::convertible_to<innermost_value_type_t<text_t>, alphabet_t>,
188  "The alphabet of the text collection must be convertible to the alphabet of the index.");
189  static_assert(dimension_v<text_t> == 1, "The input cannot be a text collection.");
190 
191  // text must not be empty
192  if (std::ranges::begin(text) == std::ranges::end(text))
193  throw std::invalid_argument("The text that is indexed cannot be empty.");
194 
195  constexpr auto sigma = alphabet_size<alphabet_t>;
196 
197  // TODO:
198  // * check what happens in sdsl when constructed twice!
199  // * choose between in-memory/external and construction algorithms
200  // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
201  // uint8_t largest_char = 0;
202  sdsl::int_vector<8> tmp_text(std::ranges::distance(text));
203 
204  std::ranges::move(text
206  | std::views::transform([] (uint8_t const r)
207  {
208  if constexpr (sigma == 256)
209  {
210  if (r == 255)
211  throw std::out_of_range("The input text cannot be indexed, because for full"
212  "character alphabets the last one/two values are reserved"
213  "(single sequence/collection).");
214  }
215  return r + 1;
216  })
217  | std::views::reverse,
218  std::ranges::begin(tmp_text)); // reverse and increase rank by one
219 
220  sdsl::construct_im(index, tmp_text, 0);
221 
222  // TODO: would be nice but doesn't work since it's private and the public member references are const
223  // index.m_C.resize(largest_char);
224  // index.m_C.shrink_to_fit();
225  // index.m_sigma = largest_char;
226  }
227 
229  template <std::ranges::range text_t>
231  requires text_layout_mode_ == text_layout::collection
233  void construct(text_t && text)
234  {
235  static_assert(std::ranges::bidirectional_range<text_t>, "The text collection must model bidirectional_range.");
236  static_assert(std::ranges::bidirectional_range<reference_t<text_t>>,
237  "The elements of the text collection must model bidirectional_range.");
238  static_assert(alphabet_size<innermost_value_type_t<text_t>> <= 256, "The alphabet is too big.");
239  static_assert(std::convertible_to<innermost_value_type_t<text_t>, alphabet_t>,
240  "The alphabet of the text collection must be convertible to the alphabet of the index.");
241  static_assert(dimension_v<text_t> == 2, "The input must be a text collection.");
242 
243  // text collection must not be empty
244  if (std::ranges::begin(text) == std::ranges::end(text))
245  throw std::invalid_argument("The text that is indexed cannot be empty.");
246 
247  size_t text_size{0}; // text size including delimiters
248 
249  // there must be at least one non-empty text in the collection
250  bool all_empty = true;
251 
252  for (auto && t : text)
253  {
254  if (std::ranges::begin(t) != std::ranges::end(t))
255  {
256  all_empty = false;
257  }
258  text_size += 1 + std::ranges::distance(t); // text size and delimiter (sum will be 1 for empty texts)
259  }
260 
261  if (all_empty)
262  throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
263 
264  constexpr auto sigma = alphabet_size<alphabet_t>;
265 
266  // Instead of creating a bitvector of size `text_size`, setting the bits to 1 and then compressing it, we can
267  // use the `sd_vector_builder(text_size, number_of_ones)` because we know the parameters and the 1s we want to
268  // set are in a strictly increasing order. This inplace construction of the compressed vector saves memory.
269  sdsl::sd_vector_builder builder(text_size, std::ranges::distance(text));
270  size_t prefix_sum{0};
271 
272  for (auto && t : text)
273  {
274  builder.set(prefix_sum);
275  prefix_sum += std::ranges::distance(t) + 1;
276  }
277 
278  text_begin = sdsl::sd_vector<>(builder);
279  text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
280  text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
281 
282  // last text in collection needs no delimiter if we have more than one text in the collection
283  sdsl::int_vector<8> tmp_text(text_size - (std::ranges::distance(text) > 1));
284 
285  constexpr uint8_t delimiter = sigma >= 255 ? 255 : sigma + 1;
286 
287 
288  std::ranges::move(text
289  | views::deep{views::to_rank}
290  | views::deep
291  {
292  std::views::transform([] (uint8_t const r)
293  {
294  if constexpr (sigma >= 255)
295  {
296  if (r >= 254)
297  throw std::out_of_range("The input text cannot be indexed, because"
298  " for full character alphabets the last one/"
299  "two values are reserved (single sequence/"
300  "collection).");
301  }
302  return r + 1;
303  })
304  }
305  | views::join(delimiter),
306  std::ranges::begin(tmp_text));
307 
308 
309  // we need at least one delimiter
310  if (std::ranges::distance(text) == 1)
311  tmp_text.back() = delimiter;
312 
313  std::ranges::reverse(tmp_text);
314 
315  sdsl::construct_im(index, tmp_text, 0);
316  }
317 
318 public:
320  static constexpr text_layout text_layout_mode = text_layout_mode_;
321 
325  using alphabet_type = alphabet_t;
328  using size_type = typename sdsl_index_type::size_type;
332 
333  template <typename bi_fm_index_t>
334  friend class bi_fm_index_cursor;
335 
336  template <typename fm_index_t>
337  friend class fm_index_cursor;
338 
339  template <typename fm_index_t>
340  friend class detail::fm_index_cursor_node;
341 
345  fm_index() = default;
346 
348  fm_index(fm_index const & rhs) :
349  index{rhs.index}, text_begin{rhs.text_begin}, text_begin_ss{rhs.text_begin_ss}, text_begin_rs{rhs.text_begin_rs}
350  {
351  text_begin_ss.set_vector(&text_begin);
352  text_begin_rs.set_vector(&text_begin);
353  }
354 
356  fm_index(fm_index && rhs) :
357  index{std::move(rhs.index)}, text_begin{std::move(rhs.text_begin)},text_begin_ss{std::move(rhs.text_begin_ss)},
358  text_begin_rs{std::move(rhs.text_begin_rs)}
359  {
360  text_begin_ss.set_vector(&text_begin);
361  text_begin_rs.set_vector(&text_begin);
362  }
363 
366  {
367  index = std::move(rhs.index);
368  text_begin = std::move(rhs.text_begin);
369  text_begin_ss = std::move(rhs.text_begin_ss);
370  text_begin_rs = std::move(rhs.text_begin_rs);
371 
372  text_begin_ss.set_vector(&text_begin);
373  text_begin_rs.set_vector(&text_begin);
374 
375  return *this;
376  }
377 
378  ~fm_index() = default;
379 
388  template <std::ranges::range text_t>
389  fm_index(text_t && text)
390  {
391  construct(std::forward<text_t>(text));
392  }
394 
406  size_type size() const noexcept
407  {
408  return index.size();
409  }
410 
422  bool empty() const noexcept
423  {
424  return size() == 0;
425  }
426 
438  bool operator==(fm_index const & rhs) const noexcept
439  {
440  // (void) rhs;
441  return (index == rhs.index) && (text_begin == rhs.text_begin);
442  }
443 
455  bool operator!=(fm_index const & rhs) const noexcept
456  {
457  return !(*this == rhs);
458  }
459 
474  cursor_type begin() const noexcept
475  {
476  return {*this};
477  }
478 
486  template <cereal_archive archive_t>
487  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
488  {
489  archive(index);
490  archive(text_begin);
491  archive(text_begin_ss);
492  text_begin_ss.set_vector(&text_begin);
493  archive(text_begin_rs);
494  text_begin_rs.set_vector(&text_begin);
495 
496  auto sigma = alphabet_size<alphabet_t>;
497  archive(sigma);
498  if (sigma != alphabet_size<alphabet_t>)
499  {
500  throw std::logic_error{"The fm_index was built over an alphabet of size " + std::to_string(sigma) +
501  " but it is being read into an fm_index with an alphabet of size " +
502  std::to_string(alphabet_size<alphabet_t>) + "."};
503  }
504 
505  bool tmp = text_layout_mode;
506  archive(tmp);
507  if (tmp != text_layout_mode)
508  {
509  throw std::logic_error{std::string{"The fm_index was built over a "} +
510  (tmp ? "text collection" : "single text") +
511  " but it is being read into an fm_index expecting a " +
512  (text_layout_mode ? "text collection." : "single text.")};
513  }
514  }
516 
517 };
518 
522 template <std::ranges::range text_t>
524 fm_index(text_t &&) -> fm_index<innermost_value_type_t<text_t>, text_layout{dimension_v<text_t> != 1}>;
526 
528 } // namespace seqan3
seqan3::single
The text is a single range.
Definition: concept.hpp:84
shortcuts.hpp
Provides various shortcuts for common std::ranges functions.
std::string
seqan3::fm_index::empty
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:422
seqan3::fm_index
The SeqAn FM Index.
Definition: fm_index.hpp:134
seqan3::reference_t
typename reference< t >::type reference_t
Shortcut for seqan3::reference (transformation_trait shortcut).
Definition: pre.hpp:77
seqan3::fm_index::fm_index
fm_index()=default
Defaulted.
fm_index_cursor.hpp
Provides the internal representation of a node of the seqan3::fm_index_cursor.
seqan3::fm_index::fm_index
fm_index(fm_index const &rhs)
When copy constructing, also update internal data structures.
Definition: fm_index.hpp:348
seqan3::views::move
const auto move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
seqan3::views::to_rank
const auto to_rank
A view that calls seqan3::to_rank() on each element in the input range.
Definition: to_rank.hpp:67
filesystem
This header includes C++17 filesystem support and imports it into namespace seqan3::filesystem (indep...
seqan3::fm_index::text_layout_mode
static constexpr text_layout text_layout_mode
Indicates whether index is built over a collection.
Definition: fm_index.hpp:320
algorithm
Adaptations of algorithms from the Ranges TS.
seqan3::fm_index::fm_index
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty.
Definition: fm_index.hpp:389
seqan3::text_layout
text_layout
The possible text layouts (single, collection) the seqan3::fm_index and seqan3::bi_fm_index can suppo...
Definition: concept.hpp:81
seqan3::collection
The text is a range of ranges.
Definition: concept.hpp:86
to.hpp
Provides seqan3::views::to.
seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type >::alphabet_type
alphabet_t alphabet_type
The type of the underlying character of the indexed text.
Definition: fm_index.hpp:326
seqan3::fm_index::operator!=
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:455
range.hpp
Provides various transformation traits used by the range module.
fm_index_cursor.hpp
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
std::to_string
T to_string(T... args)
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:36
seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type >::size_type
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:328
std::logic_error
std::invalid_argument
convertible_to
The concept std::convertible_to<From, To> specifies that an expression of the type and value category...
seqan3::fm_index::size
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:406
seqan3::fm_index::~fm_index
~fm_index()=default
Defaulted.
seqan3::fm_index::fm_index
fm_index(fm_index &&rhs)
When move constructing, also update internal data structures.
Definition: fm_index.hpp:356
seqan3::default_sdsl_index_type
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:92
join.hpp
Provides seqan3::views::join.
ranges
Adaptations of concepts from the Ranges TS.
sdsl::plain_byte_alphabet
Byte alphabet that does no mapping of char_type to comp_char_type and vice versa.
Definition: csa_alphabet_strategy.hpp:37
seqan3::bi_fm_index_cursor
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:55
seqan3::fm_index::operator=
fm_index & operator=(fm_index rhs)
When copy/move assigning, also update internal data structures.
Definition: fm_index.hpp:365
concept.hpp
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and its traits and cursors.
std::out_of_range
seqan3::innermost_value_type_t
typename innermost_value_type< t >::type innermost_value_type_t
Shortcut for seqan3::innermost_value_type (transformation_trait shortcut).
Definition: range.hpp:191
seqan3::fm_index_cursor
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:59
seqan3::fm_index
fm_index(text_t &&) -> fm_index< innermost_value_type_t< text_t >, text_layout
Deduces the alphabet and dimensions of the text.
Definition: fm_index.hpp:524
seqan3::pack_traits::transform
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
seqan3::fm_index::begin
cursor_type begin() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching.
Definition: fm_index.hpp:474
semialphabet
The basis for seqan3::alphabet, but requires only rank interface (not char).
csa_alphabet_strategy.hpp
Provides an alphabet mapping that implements an identity map (i.e. each character is mapped to its ra...
seqan3::alphabet_size
constexpr auto alphabet_size
A type trait that holds the size of a (semi-)alphabet.
Definition: concept.hpp:706
seqan3::sdsl_wt_index_type
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10000000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:86
seqan3::fm_index::operator==
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:438
to_rank.hpp
Provides seqan3::views::to_rank.