SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
fm_index.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <sdsl/suffix_trees.hpp>
16 
18 #include <seqan3/std/filesystem>
25 #include <seqan3/std/algorithm>
26 #include <seqan3/std/ranges>
27 
28 namespace seqan3
29 {
30 
32 // forward declarations
33 template <typename index_t>
34 class fm_index_cursor;
35 
36 template <typename index_t>
37 class bi_fm_index_cursor;
39 
75 using sdsl_wt_index_type =
76  sdsl::csa_wt<sdsl::wt_blcd<sdsl::bit_vector,
77  sdsl::rank_support_v<>,
78  sdsl::select_support_scan<>,
79  sdsl::select_support_scan<0>>,
80  16,
81  10000000,
82  sdsl::sa_order_sa_sampling<>,
83  sdsl::isa_sampling<>,
85 
91 
127 template <bool is_collection = false, detail::SdslIndex sdsl_index_type_ = default_sdsl_index_type>
128 class fm_index
129 {
130 protected:
132  size_t sigma{0};
134  static constexpr bool is_collection_{is_collection};
135 
139  using sdsl_index_type = sdsl_index_type_;
144  using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
146  using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type;
148 
151 
153  sdsl::sd_vector<> text_begin;
155  sdsl::select_support_sd<1> text_begin_ss;
157  sdsl::rank_support_sd<1> text_begin_rs;
158 
159 public:
163  using size_type = typename sdsl_index_type::size_type;
168 
169  template <typename bi_fm_index_t>
170  friend class bi_fm_index_cursor;
171 
172  template <typename fm_index_t>
173  friend class fm_index_cursor;
174 
175  template <typename fm_index_t>
176  friend class detail::fm_index_cursor_node;
177 
181  fm_index() = default;
182  fm_index(fm_index const &) = default;
183  fm_index & operator=(fm_index const &) = default;
184  fm_index(fm_index &&) = default;
185  fm_index & operator=(fm_index &&) = default;
186  ~fm_index() = default;
187 
196  template <std::ranges::Range text_t>
197  fm_index(text_t && text)
198  {
199  construct(std::forward<text_t>(text));
200  }
202 
222  template <std::ranges::Range text_t>
223  void construct(text_t && text)
225  requires !is_collection_
227  {
228  static_assert(std::ranges::BidirectionalRange<text_t>, "The text must model BidirectionalRange.");
229  static_assert(alphabet_size<innermost_value_type_t<text_t>> <= 256, "The alphabet is too big.");
230  static_assert(dimension_v<text_t> == 1, "The input cannot be a text collection.");
231 
232  // text must not be empty
233  if (std::ranges::begin(text) == std::ranges::end(text))
234  throw std::invalid_argument("The text that is indexed cannot be empty.");
235 
236  constexpr auto cexpr_sigma = alphabet_size<innermost_value_type_t<text_t>>;
237  sigma = cexpr_sigma;
238  // TODO:
239  // * check what happens in sdsl when constructed twice!
240  // * choose between in-memory/external and construction algorithms
241  // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
242  // uint8_t largest_char = 0;
243  sdsl::int_vector<8> tmp_text(text.size());
244 
245  std::ranges::copy(text
246  | view::to_rank
247  | std::view::transform([] (uint8_t const r)
248  {
249  if constexpr (cexpr_sigma == 256)
250  {
251  if (r == 255)
252  throw std::out_of_range("The input text cannot be indexed, because for full"
253  "character alphabets the last one/two values are reserved"
254  "(single sequence/collection).");
255  }
256  return r + 1;
257  })
259  seqan3::begin(tmp_text)); // reverse and increase rank by one
260 
261  sdsl::construct_im(index, tmp_text, 0);
262 
263  // TODO: would be nice but doesn't work since it's private and the public member references are const
264  // index.m_C.resize(largest_char);
265  // index.m_C.shrink_to_fit();
266  // index.m_sigma = largest_char;
267  }
268 
270  template <std::ranges::Range text_t>
271  void construct(text_t && text)
273  requires is_collection_
275  {
276  static_assert(std::ranges::BidirectionalRange<text_t>, "The text collection must model BidirectionalRange.");
278  "The elements of the text collection must model BidirectionalRange.");
279  static_assert(alphabet_size<innermost_value_type_t<text_t>> <= 256, "The alphabet is too big.");
280  static_assert(dimension_v<text_t> == 2, "The input must be a text collection.");
281 
282  // text collection must not be empty
283  if (std::ranges::begin(text) == std::ranges::end(text))
284  throw std::invalid_argument("The text that is indexed cannot be empty.");
285 
286  size_t text_size{0}; // text size including delimiters
287 
288  // there must be at least one non-empty text in the collection
289  bool all_empty = true;
290 
291  for (auto && t : text)
292  {
294  {
295  all_empty = false;
296  }
297  text_size += 1 + t.size(); // text size and delimiter (sum will be 1 for empty texts)
298  }
299 
300  if (all_empty)
301  throw std::invalid_argument("A text collection that only contains empty texts cannot be indexed.");
302 
303  constexpr auto cexpr_sigma = alphabet_size<innermost_value_type_t<text_t>>;
304  sigma = cexpr_sigma;
305 
306  // bitvector where 1 marks the begin position of a single text from the collection in the concatenated text
307  sdsl::bit_vector pos(text_size, 0);
308  size_t prefix_sum{0};
309 
310  for (auto && t : text)
311  {
312  pos[prefix_sum] = 1;
313  prefix_sum += t.size() + 1;
314  }
315 
316  text_begin = sdsl::sd_vector(pos);
317  text_begin_ss = sdsl::select_support_sd<1>(&text_begin);
318  text_begin_rs = sdsl::rank_support_sd<1>(&text_begin);
319 
320  sdsl::int_vector<8> tmp_text(text_size - 1); // last text in collection needs no delimiter
321 
322  constexpr uint8_t delimiter = cexpr_sigma >= 255 ? 255 : cexpr_sigma + 1;
323 
324  std::vector<uint8_t> tmp = text
325  | view::deep{view::to_rank}
326  | view::deep
327  {
328  std::view::transform([] (uint8_t const r)
329  {
330  if constexpr (cexpr_sigma >= 255)
331  {
332  if (r >= 254)
333  throw std::out_of_range("The input text cannot be indexed, because"
334  " for full character alphabets the last one/"
335  "two values are reserved (single sequence/"
336  "collection).");
337  }
338  return r + 1;
339  })
340  }
341  | std::view::join(delimiter);
342 
344 
346  // std::ranges::copy(text
347  // | view::deep{view::to_rank}
348  // | view::deep{std::view::transform([] (uint8_t const r) { return r + 1; })} // increase rank
349  // | view::deep{std::view::reverse}
350  // | std::view::reverse
351  // | std::view::join(delimiter), // join with delimiter
352  // seqan3::begin(tmp_text));
353 
354  sdsl::construct_im(index, tmp_text, 0);
355  }
356 
368  size_type size() const noexcept
369  {
370  return index.size();
371  }
372 
384  bool empty() const noexcept
385  {
386  return size() == 0;
387  }
388 
400  bool operator==(fm_index const & rhs) const noexcept
401  {
402  // (void) rhs;
403  return (index == rhs.index) && (text_begin == rhs.text_begin);
404  }
405 
417  bool operator!=(fm_index const & rhs) const noexcept
418  {
419  return !(*this == rhs);
420  }
421 
436  cursor_type begin() const noexcept
437  {
438  return {*this};
439  }
440 
448  template <CerealArchive archive_t>
449  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
450  {
451  archive(index);
452  archive(text_begin);
453  archive(text_begin_ss);
454  text_begin_ss.set_vector(&text_begin);
455  archive(text_begin_rs);
456  text_begin_rs.set_vector(&text_begin);
457  archive(sigma);
458  bool tmp = is_collection_;
459  archive(tmp);
460  assert(tmp == is_collection_);
461  }
463 
464 };
465 
469 template <std::ranges::Range text_t>
471 fm_index(text_t &&) -> fm_index<dimension_v<text_t> != 1>;
473 
475 } // namespace seqan3
fm_index(text_t &&) -> fm_index< dimension_v< text_t > !=1 >
Deduces the dimensions of the text.
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10000000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_wt_index_type
The FM Index Configuration using a Wavelet Tree.
Definition: fm_index.hpp:84
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and its traits and cursors...
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:368
Provides various shortcuts for common std::ranges functions.
constexpr auto reverse
A range adaptor that presents the underlying range in reverse order.
Definition: ranges:721
size_t sigma
The alphabet size of the text.
Definition: fm_index.hpp:132
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:384
Provides an alphabet mapping that implements an identity map (i.e. each character is mapped to its ra...
sdsl_index_type index
Underlying index from the SDSL.
Definition: fm_index.hpp:150
The main SeqAn3 namespace.
sdsl::select_support_sd< 1 > text_begin_ss
Select support for text_begin.
Definition: fm_index.hpp:155
constexpr auto join
Flattens a View of ranges into a View.
Definition: ranges:683
sdsl::rank_support_sd< 1 > text_begin_rs
Rank support for text_begin.
Definition: fm_index.hpp:157
bool operator==(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:400
Provides seqan3::view::to_rank.
Specifies requirements of a Range type for which begin returns a type that models std::BidirectionalI...
static constexpr bool is_collection_
Indicates whether index is built over a collection.
Definition: fm_index.hpp:134
void construct(text_t &&text)
Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has t...
Definition: fm_index.hpp:223
bool operator!=(fm_index const &rhs) const noexcept
Compares two indices.
Definition: fm_index.hpp:417
sdsl_index_type_ sdsl_index_type
The type of the underlying SDSL index.
Definition: fm_index.hpp:140
Adaptations of concepts from the Ranges TS.
fm_index()=default
Default constructor.
typename sdsl_index_type::alphabet_type::sigma_type sdsl_sigma_type
The type of the alphabet size of the underlying SDSL index.
Definition: fm_index.hpp:146
typename innermost_value_type< t >::type innermost_value_type_t
Shortcut for seqan3::innermost_value_type (TransformationTrait shortcut).
Definition: range.hpp:191
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
fm_index(text_t &&text)
Constructor that immediately constructs the index given a range. The range cannot be empty...
Definition: fm_index.hpp:197
fm_index & operator=(fm_index const &)=default
Copy assignment.
sdsl_wt_index_type default_sdsl_index_type
The default FM Index Configuration.
Definition: fm_index.hpp:90
The SeqAn Bidirectional FM Index Cursor.
Definition: bi_fm_index_cursor.hpp:54
Byte alphabet that does no mapping of char_type to comp_char_type and vice versa. ...
Definition: csa_alphabet_strategy.hpp:35
cursor_type begin() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching.
Definition: fm_index.hpp:436
Adaptations of algorithms from the Ranges TS.
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:164
sdsl::sd_vector text_begin
Bitvector storing begin positions for collections.
Definition: fm_index.hpp:153
~fm_index()=default
Destructor.
Provides various transformation traits used by the range module.
typename reference< t >::type reference_t
Shortcut for seqan3::reference (TransformationTrait shortcut).
Definition: pre.hpp:77
Provides the seqan3::fm_index_cursor for searching in the unidirectional seqan3::fm_index.
Provides the internal representation of a node of the seqan3::fm_index_cursor.
constexpr auto alphabet_size
A type trait that holds the size of a (semi-)alphabet.
Definition: concept.hpp:678
typename sdsl_index_type::alphabet_type::char_type sdsl_char_type
The type of the reduced alphabet type. (The reduced alphabet might be smaller than the original alpha...
Definition: fm_index.hpp:144
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:179
constexpr auto transform
A range adaptor that takes a invocable and returns a view of the elements with the invocable applied...
Definition: ranges:911
The SeqAn FM Index Cursor.
Definition: fm_index_cursor.hpp:64
auto const to_rank
A view that calls seqan3::to_rank() on each element in the input range.
Definition: to_rank.hpp:68
The SeqAn FM Index.
Definition: fm_index.hpp:128
This header includes C++17 filesystem support and imports it into namespace seqan3::filesystem (indep...