SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ > Class Template Reference

A class for reading sequence files, e.g. FASTA, FASTQ ... More...

#include <seqan3/io/sequence_file/input.hpp>

Public Types

using field_ids = fields< field::SEQ, field::ID, field::QUAL, field::SEQ_QUAL >
 The subset of seqan3::field IDs that are valid for this file; order corresponds to the types in field_types.
 
Template arguments

Exposed as member types for public access.

using traits_type = traits_type_
 A traits type that defines aliases and template for storage of the fields.
 
using selected_field_ids = selected_field_ids_
 A seqan3::fields list with the fields selected for the record.
 
using valid_formats = valid_formats_
 A seqan3::type_list with the possible formats.
 
using stream_char_type = stream_char_type_
 Character type of the stream(s), usually char.
 
Field types and record type

These types are relevant for record/row-based reading; they may be manipulated via the traits_type to achieve different storage behaviour.

using sequence_type = typename traits_type::template sequence_container< typename traits_type::sequence_alphabet >
 The type of field::SEQ (std::vector <seqan3::dna5> by default).
 
using id_type = typename traits_type::template id_container< typename traits_type::id_alphabet >
 The type of field::ID (std::string by default).
 
using quality_type = typename traits_type::template quality_container< typename traits_type::quality_alphabet >
 The type of field::QUAL (std::vector <seqan3::phred42> by default).
 
using sequence_quality_type = typename traits_type::template sequence_container< qualified< typename traits_type::sequence_alphabet, typename traits_type::quality_alphabet > >
 The type of field::SEQ_QUAL (std::vector <seqan3::dna5q> by default).
 
using field_types = type_list< sequence_type, id_type, quality_type, sequence_quality_type >
 The previously defined types aggregated in a seqan3::type_list.
 
using record_type = record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids >
 The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field types.
 
Field column types and tuple type

These types are relevant for field/column-wise reading; they may be manipulated via the traits_type to achieve different storage behaviour.

using sequence_column_type = typename traits_type::template sequence_container_container< sequence_type >
 Column type of field::SEQ (seqan3::concatenated_sequences<sequence_type> by default).
 
using id_column_type = typename traits_type::template id_container_container< id_type >
 Column type of field::ID (seqan3::concatenated_sequences<id_type> by default).
 
using quality_column_type = typename traits_type::template quality_container_container< quality_type >
 Column type of field::QUAL (seqan3::concatenated_sequences<quality_type> by default).
 
using sequence_quality_column_type = typename traits_type::template sequence_container_container< sequence_quality_type >
 Column type of field::SEQ_QUAL (seqan3::concatenated_sequences<sequence_quality_type> by default).
 
using field_column_types = type_list< sequence_column_type, id_column_type, quality_column_type, sequence_quality_column_type >
 The previously defined types aggregated in a seqan3::type_list.
 
using file_as_tuple_type = record< detail::select_types_with_ids_t< field_column_types, field_ids, selected_field_ids >, selected_field_ids >
 The type emulated by the file when read column-wise.
 
Range associated types

The types necessary to facilitate the behaviour of an input range (used in record-wise reading).

using value_type = record_type
 The value_type is the record_type.
 
using reference = record_type &
 The reference type.
 
using const_reference = void
 The const_reference type is void, because files are not const-iterable.
 
using size_type = size_t
 An unsigned integer type, usually std::size_t.
 
using difference_type = std::make_signed_t< size_t >
 A signed integer type, usually std::ptrdiff_t.
 
using iterator = detail::in_file_iterator< sequence_file_input >
 The iterator type of this view (an input iterator).
 
using const_iterator = void
 The const iterator type is void, because files are not const-iterable.
 
using sentinel = std::ranges::default_sentinel_t
 The type returned by end().
 

Public Member Functions

Constructors, destructor and assignment
 sequence_file_input ()=delete
 Default constructor is explicitly deleted, you need to give a stream or file name.
 
 sequence_file_input (sequence_file_input const &)=delete
 Copy construction is explicitly deleted, because you can't have multiple access to the same file.
 
sequence_file_input & operator= (sequence_file_input const &)=delete
 Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
 
 sequence_file_input (sequence_file_input &&)=default
 Move construction is defaulted.
 
sequence_file_input & operator= (sequence_file_input &&)=default
 Move assignment is defaulted.
 
 ~sequence_file_input ()=default
 Destructor is defaulted.
 
 sequence_file_input (std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
 Construct from filename. More...
 
template<IStream2 stream_t, SequenceFileInputFormat file_format>
 sequence_file_input (stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
 Construct from an existing stream and with specified format. More...
 
template<IStream2 stream_t, SequenceFileInputFormat file_format>
 sequence_file_input (stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
 
Range interface

Provides functions for record based reading of the file.

iterator begin () noexcept
 Returns an iterator to current position in the file. More...
 
sentinel end () noexcept
 Returns a sentinel for comparison with iterator. More...
 
reference front () noexcept
 Return the record we are currently at in the file. More...
 

Public Attributes

sequence_file_input_options< typename traits_type::sequence_legal_alphabet, selected_field_ids::contains(field::SEQ_QUAL)> options
 The options are public and its members can be set directly.
 

Friends

Tuple interface

Provides functions for field-based ("column"-based) reading.

template<field f>
auto & get (sequence_file_input &file)
 Read the entire file into internal buffers and retrieve the specified column.
 
template<field f>
auto && get (sequence_file_input &&file)
 Read the entire file into internal buffers and retrieve the specified column. More...
 
template<size_t i>
auto & get (sequence_file_input &file)
 Read the entire file into internal buffers and retrieve the specified column. More...
 
template<size_t i>
auto && get (sequence_file_input &&file)
 Read the entire file into internal buffers and retrieve the specified column. More...
 
template<typename t >
auto & get (sequence_file_input &file)
 Read the entire file into internal buffers and retrieve the specified column. More...
 
template<typename t >
auto && get (sequence_file_input &&file)
 Read the entire file into internal buffers and retrieve the specified column. More...
 

Related Functions

(Note that these are not member functions.)

Type deduction guides
template<IStream2 stream_type, SequenceFileInputFormat file_format, detail::Fields selected_field_ids>
 sequence_file_input (stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >, typename std::remove_reference_t< stream_type >::char_type >
 Deduction of the selected fields, the file format and the stream type.
 
template<IStream2 stream_type, SequenceFileInputFormat file_format, detail::Fields selected_field_ids>
 sequence_file_input (stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >, typename std::remove_reference_t< stream_type >::char_type >
 

Detailed Description

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
class seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >

A class for reading sequence files, e.g. FASTA, FASTQ ...

Template Parameters
traits_type: An auxiliary type that defines certain member types and constants, must satisfy seqan3::SequenceFileInputTraits.
selected_field_ids: A seqan3::fields type with the list and order of desired record entries; all fields must be in seqan3::sequence_file_input::field_ids.
valid_formats: A seqan3::type_list of the selectable formats (each must meet seqan3::SequenceFileInputFormat).
stream_char_type: The type of the underlying stream device(s); must model seqan3::Char.

Introduction

Sequence files are the most generic and common biological files. Well-known formats include FastA and FastQ, but some may also be interested in treating SAM or BAM files as sequence files, discarding the alignment.

The Sequence file abstraction supports reading four different fields:

  1. seqan3::field::SEQ
  2. seqan3::field::ID
  3. seqan3::field::QUAL
  4. seqan3::field::SEQ_QUAL (sequence and qualities in one range)

The first three fields are retrieved by default (and in that order). The last field may be selected to have sequence and qualities directly stored in a more memory-efficient combined container. If you select the last field you may not select seqan3::field::SEQ or seqan3::field::QUAL.

Construction and specialisation

This class comes with two constructors, one for construction from a file name and one for construction from an existing stream and a known format. The first one automatically picks the format based on the extension of the file name. The second can be used if you have a non-file stream, like std::cin or std::istringstream, that you want to read from and/or if you cannot use file-extension based detection, but know that your input file has a certain format.

In most cases the template parameters are deduced completely automatically:

sequence_file_input fin{tmp_dir/"my.fasta"}; // FastA with DNA sequences assumed, regular std::ifstream taken as stream

Reading from an std::istringstream:

std::string input
{
"> TEST1\n"
"ACGT\n"
"> Test2\n"
"AGGCTGN\n"
"> Test3\n"
"GGAGTATAATATATATATATATAT\n"
};
std::istringstream iss(input);
sequence_file_input fin{std::move(iss), format_fasta{}};
// ^ no need to specify the template arguments

Note that this is not the same as writing sequence_file_input<> (with angle brackets). In the latter case they are explicitly set to their default values, in the former case automatic deduction happens which chooses different parameters depending on the constructor arguments. For opening from file, sequence_file_input<> would have also worked, but for opening from stream it would not have.

In some cases, you do need to specify the arguments, e.g. if you want to read amino acids:

sequence_file_input<sequence_file_input_default_traits_aa> fin{tmp_dir/"my.fasta"};

You can define your own traits type to further customise the types used by and returned by this class, see seqan3::sequence_file_input_default_traits_dna for more details. As mentioned above, specifying at least one template parameter yourself means that you lose automatic deduction, so if you want to read amino acids and want to read from a string stream you need to give all types yourself:

// ... input had amino acid sequences
std::string input
{
"> TEST1\n"
"FQTWE\n"
"> Test2\n"
"KYRTW\n"
"> Test3\n"
"EEYQTWEEFARAAEKLYLTDPMKV\n"
};
std::istringstream iss(input);
sequence_file_input<sequence_file_input_default_traits_aa /*Use amino acid traits here*/,
fields<field::SEQ, field::ID, field::QUAL>,
type_list<format_fasta>, char> fin{iss, format_fasta{}};

Reading record-wise

You can iterate over this file record-wise:

sequence_file_input fin{tmp_dir/"my.fasta"};
for (auto & rec : fin)
{
debug_stream << "ID: " << get<field::ID>(rec) << '\n';
debug_stream << "SEQ: " << get<field::SEQ>(rec) << '\n';
// a quality field also exists, but is not printed, because we know it's empty for FastA files.
}

In the above example, rec has the type record_type which is a specialisation of seqan3::record and behaves like an std::tuple (that's why we can access it via get). Instead of using the seqan3::field based interface on the record, you could also use std::get<0> or even std::get<dna5_vector> to retrieve the sequence, but it is not recommended, because it is more error-prone.

Note: It is important to write auto & and not just auto, otherwise you will copy the record on every iteration. Since the buffer gets "refilled" on every iteration, you can also move the data out of the record if you want to store it somewhere without copying:

sequence_file_input fin{tmp_dir/"my.fasta"};
using record_type = typename decltype(fin)::record_type;
std::vector<record_type> records{};
for (auto & rec : fin)
records.push_back(std::move(rec));

Reading record-wise (decomposed records)

Instead of using get on the record, you can also use structured bindings to decompose the record into its elements:

sequence_file_input fin{tmp_dir/"my.fasta"};
for (auto & [ seq, id, qual ] : fin)
{
debug_stream << "ID: " << id << '\n';
debug_stream << "SEQ: " << seq << '\n';
debug_stream << "EMPTY QUAL." << qual << '\n'; // qual is empty for FastA files
}

In this case you immediately get the three elements of the tuple: seq of sequence_type, id of id_type and qual of quality_type. But beware: with structured bindings you do need to get the order of elements correct!

Reading record-wise (custom fields)

If you want to skip specific fields from the record you can pass a non-empty fields trait object to the sequence_file_input constructor to select the fields that should be read from the input. For example, you can choose a combined field for SEQ and QUAL (see above), or never actually read the QUAL if you don't need it. The following snippet demonstrates the usage of such a fields trait object.

sequence_file_input fin{tmp_dir/"my.fastq", fields<field::ID, field::SEQ_QUAL>{}};
for (auto & [ id, seq_qual ] : fin) // the order is now different, "id" comes first, because it was specified first
{
debug_stream << "ID: " << id << '\n';
// sequence and qualities are part of the same vector, of type std::vector<dna5q>
debug_stream << "SEQ: " << (seq_qual | view::get<0>) << '\n'; // sequence string is extracted
debug_stream << "QUAL: " << (seq_qual | view::get<1>) << '\n'; // quality string is extracted
}

When reading a file, all fields not present in the file (but requested implicitly or via the selected_field_ids parameter) are ignored.

Views on files

Since SeqAn files are ranges, you can also create views over files. A useful example is to filter the records based on certain criteria, e.g. minimum length of the sequence field:

sequence_file_input fin{tmp_dir/"my.fasta"};
auto minimum_length5_filter = std::view::filter([] (auto const & rec)
{
return std::ranges::size(get<field::SEQ>(rec)) >= 5;
});
for (auto & rec : fin | minimum_length5_filter) // only record with sequence length >= 5 will "appear"
{
debug_stream << "IDs of seq_length >= 5: " << get<field::ID>(rec) << '\n';
// ...
}

End of file

You can check whether a file is at end by comparing begin() and end() (if they are the same, the file is at end).

Column-based reading

The record-based interface treats the file as a range of tuples (the records), but in certain situations it is desirable to read the file by field, i.e. column wise (tuple-of-ranges, instead of range-of-tuples).

This interface is less flexible, but can save you copy operations in certain scenarios, given that you have sufficient memory to load the entire file at once:

struct data_storage_t
{
concatenated_sequences<dna5_vector> sequences;
concatenated_sequences<std::string> ids;
};
data_storage_t data_storage; // a global or globally used variable in your program
// ... in your file reading function:
sequence_file_input fin{tmp_dir/"my.fasta"};
data_storage.sequences = std::move(get<field::SEQ>(fin)); // we move the buffer directly into our storage
data_storage.ids = std::move(get<field::ID>(fin)); // we move the buffer directly into our storage

Note that for this to make sense, your storage data types need to be identical to the corresponding column types of the file. If you require different column types you can specify your own traits, see seqan3::SequenceFileInputTraits.

Formats

We currently support reading the following formats:

Constructor & Destructor Documentation

◆ sequence_file_input() [1/3]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >::sequence_file_input ( std::filesystem::path  filename,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

Construct from filename.

Parameters
[in] filename: Path to the file you wish to open.
[in] fields_tag: A seqan3::fields tag. [optional]
Exceptions
seqan3::file_open_error: If the file could not be opened, e.g. non-existent, non-readable, unknown format.

In addition to the file name, you may specify a custom seqan3::fields type which may be easier than defining all the template parameters.

Decompression

This constructor transparently applies a decompression stream on top of the file stream in case the file is detected as being compressed. See the section on compression and decompression for more information.

◆ sequence_file_input() [2/3]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<IStream2 stream_t, SequenceFileInputFormat file_format>
seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >::sequence_file_input ( stream_t &  stream,
file_format const &  format_tag,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

Construct from an existing stream and with specified format.

Template Parameters
file_format: The format of the file in the stream, must satisfy seqan3::SequenceFileInputFormat.
Parameters
[in] stream: The stream to operate on; must be derived from std::basic_istream.
[in] format_tag: The file format tag.
[in] fields_tag: A seqan3::fields tag. [optional]

Decompression

This constructor transparently applies a decompression stream on top of the stream in case it is detected as being compressed. See the section on compression and decompression for more information.

◆ sequence_file_input() [3/3]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<IStream2 stream_t, SequenceFileInputFormat file_format>
seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >::sequence_file_input ( stream_t &&  stream,
file_format const &  format_tag,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Member Function Documentation

◆ begin()

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
iterator seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >::begin ( )
inline noexcept

Returns an iterator to current position in the file.

Returns
An iterator pointing to the current position in the file.

Equals end() if the file is at end.

Complexity

Constant.

Exceptions

No-throw guarantee.

◆ end()

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
sentinel seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >::end ( )
inline noexcept

Returns a sentinel for comparison with iterator.

Returns
A sentinel that marks the end of the file; an iterator compares equal to it once the file is at end.

This element acts as a placeholder; attempting to dereference it results in undefined behaviour.

Complexity

Constant.

Exceptions

No-throw guarantee.

◆ front()

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
reference seqan3::sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ >::front ( )
inline noexcept

Return the record we are currently at in the file.

Returns
A reference to the currently buffered record.

This function returns a reference to the currently buffered record. It is identical to dereferencing begin(), because on single-pass input ranges begin() always points to the current record:

sequence_file_input fin{tmp_dir/"my.fasta"};
auto it = begin(fin);
// the following are equivalent:
auto & rec0 = *it;
auto & rec1 = fin.front();
// Note: rec0 and rec1 are references and become invalid after incrementing "it"!

In most situations, using the iterator interface or a range-based for-loop is preferable to using front(), because you can only move to the next record via the iterator.

In any case, don't forget the reference! If you want to save the data from the record elsewhere, use move:

sequence_file_input fin{tmp_dir/"my.fasta"};
auto rec0 = std::move(fin.front());

Complexity

Constant.

Exceptions

No-throw guarantee.

Friends And Related Function Documentation

◆ get [1/5]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<field f>
auto&& get ( sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ > &&  file)
friend

Read the entire file into internal buffers and retrieve the specified column.

◆ get [2/5]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<size_t i>
auto& get ( sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ > &  file)
friend

Read the entire file into internal buffers and retrieve the specified column.

◆ get [3/5]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<size_t i>
auto&& get ( sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ > &&  file)
friend

Read the entire file into internal buffers and retrieve the specified column.

◆ get [4/5]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<typename t >
auto& get ( sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ > &  file)
friend

Read the entire file into internal buffers and retrieve the specified column.

◆ get [5/5]

template<SequenceFileInputTraits traits_type_ = sequence_file_input_default_traits_dna, detail::Fields selected_field_ids_ = fields<field::SEQ, field::ID, field::QUAL>, detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl, format_fasta, format_fastq, format_genbank, format_sam>, Char stream_char_type_ = char>
template<typename t >
auto&& get ( sequence_file_input< traits_type_, selected_field_ids_, valid_formats_, stream_char_type_ > &&  file)
friend

Read the entire file into internal buffers and retrieve the specified column.

◆ sequence_file_input()

template<IStream2 stream_type, SequenceFileInputFormat file_format, detail::Fields selected_field_ids>
sequence_file_input ( stream_type &  stream,
file_format const &  ,
selected_field_ids const &   
) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >, typename std::remove_reference_t< stream_type >::char_type >
related

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.


The documentation for this class was generated from the following file: