SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ > Class Template Reference

The SeqAn FM Index. More...

#include <seqan3/search/fm_index/fm_index.hpp>

+ Inheritance diagram for seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >:

Public Member Functions

cursor_type cursor () const noexcept
 Returns a seqan3::fm_index_cursor on the index that can be used for searching. Cursor is pointing to the root node of the implicit suffix tree. .
 
bool empty () const noexcept
 Checks whether the index is empty.
 
bool operator!= (fm_index const &rhs) const noexcept
 Compares two indices.
 
bool operator== (fm_index const &rhs) const noexcept
 Compares two indices.
 
template<cereal_archive archive_t>
void serialize (archive_t &archive)
 Serialisation support function.
 
size_type size () const noexcept
 Returns the length of the indexed text including sentinel characters.
 
Constructors, destructor and assignment
 fm_index ()=default
 Defaulted.
 
 fm_index (fm_index const &rhs)
 When copy constructing, also update internal data structures.
 
 fm_index (fm_index &&rhs)
 When move constructing, also update internal data structures.
 
fm_indexoperator= (fm_index rhs)
 When copy/move assigning, also update internal data structures.
 
 ~fm_index ()=default
 Defaulted.
 
template<std::ranges::bidirectional_range text_t>
 fm_index (text_t &&text)
 Constructor that immediately constructs the index given a range. The range cannot be empty.
 

Static Public Attributes

static constexpr text_layout text_layout_mode = text_layout_mode_
 Indicates whether index is built over a collection.
 

Private Member Functions

template<std::ranges::range text_t>
requires (text_layout_mode_ == text_layout::single)
void construct (text_t &&text)
 Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has to be non-empty.
 
template<std::ranges::range text_t>
requires (text_layout_mode_ == text_layout::collection)
void construct (text_t &&text, bool reverse=false)
 This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
 

Static Private Member Functions

template<typename output_it_t , typename sequence_t >
static output_it_t copy_sequence_ranks_shifted_by_one (output_it_t output_it, sequence_t &&sequence)
 Eagerly convert sequence into ranks, shift by one and copy them into output_it.
 

Private Attributes

sdsl_index_type index
 Underlying index from the SDSL.
 
sdsl::sd_vector text_begin
 Bitvector storing begin positions for collections.
 
sdsl::rank_support_sd< 1 > text_begin_rs
 Rank support for text_begin.
 
sdsl::select_support_sd< 1 > text_begin_ss
 Select support for text_begin.
 

Friends

template<typename bi_fm_index_t >
class bi_fm_index_cursor
 
template<typename fm_index_t >
struct detail::fm_index_cursor_node
 
class detail::reverse_fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >
 
template<typename fm_index_t >
class fm_index_cursor
 

Member types

using sdsl_index_type = sdsl_index_type_
 The type of the underlying SDSL index.
 
using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type
 The type of the reduced alphabet type. (The reduced alphabet might be smaller than the original alphabet in case not all possible characters occur in the indexed text.)
 
using sdsl_sigma_type = typename sdsl_index_type::alphabet_type::sigma_type
 The type of the alphabet size of the underlying SDSL index.
 
using alphabet_type = alphabet_t
 The type of the underlying character of the indexed text.
 
using size_type = typename sdsl_index_type::size_type
 Type for representing positions in the indexed text.
 
using cursor_type = fm_index_cursor< fm_index >
 The type of the (unidirectional) cursor.
 

Detailed Description

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
class seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >

The SeqAn FM Index.

Template Parameters
alphabet_tThe alphabet type; must model seqan3::semialphabet.
text_layout_mode_Indicates whether this index works on a text collection or a single text. See seqan3::text_layout.
sdsl_index_type_The type of the underlying SDSL index, must model seqan3::sdsl_index.

The seqan3::fm_index is a fast and space-efficient string index to search strings and collections of strings.

General information

Here is a short example on how to build an index and search a pattern using an cursor. Please note that there is a very powerful search module with a high-level interface seqan3::search that encapsulates the use of cursors.

// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <vector>
int main()
{
using namespace seqan3::literals;
std::vector<seqan3::dna4> genome{"ATCGATCGAAGGCTAGCTAGCTAAGGGA"_dna4};
seqan3::fm_index index{genome}; // build the index
auto cur = index.cursor(); // create a cursor
cur.extend_right("AAGG"_dna4); // search the pattern "AAGG"
seqan3::debug_stream << "Number of hits: " << cur.count() << '\n'; // outputs: 2
seqan3::debug_stream << "Positions in the genome: ";
for (auto && pos : cur.locate()) // outputs: (0, 8), (0, 22)
seqan3::debug_stream << pos << ' ';
return 0;
}
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition fm_index_cursor.hpp:257
The SeqAn FM Index.
Definition fm_index.hpp:186
cursor_type cursor() const noexcept
Returns a seqan3::fm_index_cursor on the index that can be used for searching. Cursor is pointing to...
Definition fm_index.hpp:538
sdsl_index_type index
Underlying index from the SDSL.
Definition fm_index.hpp:204
Provides seqan3::debug_stream and related types.
Provides seqan3::dna4, container aliases and string literals.
debug_stream_type debug_stream
A global instance of seqan3::debug_stream_type.
Definition debug_stream.hpp:37
The SeqAn namespace for literals.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
Meta-header for the Search / FM Index submodule .
Attention
When building an index for a single text over any alphabet, the symbol with rank 255 is reserved and may not occur in the text.

Here is an example using a collection of strings (e.g. a genome with multiple chromosomes or a protein database):

// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <vector>
int main()
{
using namespace seqan3::literals;
std::vector<std::vector<seqan3::dna4>> genomes{"ATCTGACGAAGGCTAGCTAGCTAAGGGA"_dna4,
"TAGCTGAAGCCATTGGCATCTGATCGGACT"_dna4,
"ACTGAGCTCGTC"_dna4,
"TGCATGCACCCATCGACTGACTG"_dna4,
"GTACGTACGTTACG"_dna4};
seqan3::fm_index index{genomes}; // build the index
auto cur = index.cursor(); // create a cursor
cur.extend_right("CTGA"_dna4); // search the pattern "CTGA"
seqan3::debug_stream << "Number of hits: " << cur.count() << '\n'; // outputs: 5
seqan3::debug_stream << "Positions in the genomes: ";
for (auto && pos : cur.locate()) // outputs: (3,16) (2,1) (1,3) (0,2) (1,19)
seqan3::debug_stream << pos << ' ';
return 0;
}
Attention
When building an index for a text collection over any alphabet, the symbols with rank 254 and 255 are reserved and may not be used in the text.

Choosing an index implementation

The underlying implementation of the FM Index (rank data structure, sampling rates, etc.) can be specified by passing a new SDSL index type as second template parameter:

Todo:
Link to SDSL documentation or write our own once SDSL3 documentation is available somewhere....

Constructor & Destructor Documentation

◆ fm_index()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
template<std::ranges::bidirectional_range text_t>
seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::fm_index ( text_t &&  text)
inlineexplicit

Constructor that immediately constructs the index given a range. The range cannot be empty.

Template Parameters
text_tThe type of range to construct from; must model std::ranges::bidirectional_range.
Parameters
[in]textThe text to construct from.

Complexity

Todo:
At least linear.

Member Function Documentation

◆ construct()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
template<std::ranges::range text_t>
requires (text_layout_mode_ == text_layout::single)
void seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::construct ( text_t &&  text)
inlineprivate

Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has to be non-empty.

Template Parameters
text_tThe type of range to construct from; must model std::ranges::bidirectional_range.
Parameters
[in]textThe text to construct from.
Todo:
This has to be better implemented with regard to the memory peak due to not matching interfaces with the SDSL.

Complexity

Todo:
At least linear.

Exceptions

No guarantee.

Todo:
Ensure strong exception guarantee.

◆ cursor()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
cursor_type seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::cursor ( ) const
inlinenoexcept

Returns a seqan3::fm_index_cursor on the index that can be used for searching. Cursor is pointing to the root node of the implicit suffix tree. .

Returns
Returns a (unidirectional) seqan3::fm_index_cursor on the index.

Complexity

Constant.

Exceptions

No-throw guarantee.

◆ empty()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
bool seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::empty ( ) const
inlinenoexcept

Checks whether the index is empty.

Returns
true if the index is empty, false otherwise.

Complexity

Constant.

Exceptions

No-throw guarantee.

◆ operator!=()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
bool seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::operator!= ( fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ > const &  rhs) const
inlinenoexcept

Compares two indices.

Returns
true if the indices are unequal, false otherwise.

Complexity

Linear.

Exceptions

No-throw guarantee.

◆ operator==()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
bool seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::operator== ( fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ > const &  rhs) const
inlinenoexcept

Compares two indices.

Returns
true if the indices are equal, false otherwise.

Complexity

Linear.

Exceptions

No-throw guarantee.

◆ serialize()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
template<cereal_archive archive_t>
void seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::serialize ( archive_t &  archive)
inline

Serialisation support function.

Template Parameters
archive_tType of archive; must satisfy seqan3::cereal_archive.
Parameters
archiveThe archive being serialised from/to.
Attention
These functions are never called directly, see Serialisation for more details.

◆ size()

template<semialphabet alphabet_t, text_layout text_layout_mode_, detail::sdsl_index sdsl_index_type_ = default_sdsl_index_type>
size_type seqan3::fm_index< alphabet_t, text_layout_mode_, sdsl_index_type_ >::size ( ) const
inlinenoexcept

Returns the length of the indexed text including sentinel characters.

Returns
Returns the length of the indexed text including sentinel characters.

Complexity

Constant.

Exceptions

No-throw guarantee.


The documentation for this class was generated from the following file:
Hide me